set.seed(100)
setwd("~/GitHub/Ruby_Talk_Material")
library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2
trainData <- read.csv('train.csv', sep = ',', header = TRUE)
testData <- read.csv('test.csv', sep = ',', header = TRUE)

# Treat the outcome (delayed yes/no) and the day of the week as categorical,
# and drop the X column (row indices carried over by read.csv).
trainData$ARR_DEL15 <- as.factor(trainData$ARR_DEL15)
testData$ARR_DEL15 <- as.factor(testData$ARR_DEL15)
trainData$DAY_OF_WEEK <- as.factor(trainData$DAY_OF_WEEK)
testData$DAY_OF_WEEK <- as.factor(testData$DAY_OF_WEEK)
trainData$X <- NULL
testData$X <- NULL
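
Before modelling, a quick sanity check that the column types are right and a look at how the two classes are balanced (a minimal sketch; str and table are base R):

str(trainData)
table(trainData$ARR_DEL15)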

Now we train the model. We use a fairly simple algorithm first to do the classification. If its performance is not good enough, we can move on to ensemble algorithms, which are usually better. Even better would be to select the most important variables from the data, include additional predictor variables, or do some feature engineering.

We choose logistic regression to start with: essentially a regression model that predicts a binary outcome.

library(caret)
# The '.' in the formula stands for 'all remaining columns as predictors'.
# 'glm' fits a generalized linear model; setting family = 'binomial' makes
# it a logistic regression.
logisticRegModel <- train(ARR_DEL15 ~ ., data = trainData, method = 'glm', family = 'binomial')
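
To inspect the fitted coefficients, we can look at the underlying glm object that caret stores in the train result (a quick check, assuming the fit above ran through):

summary(logisticRegModel$finalModel)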

Now we can use the model and the test data to check how well we predict flight arrival delays.

logRegPrediction <- predict(logisticRegModel, testData)
logRegConfMat <- confusionMatrix(logRegPrediction, testData[,"ARR_DEL15"])
logRegConfMat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7465 2273
##          1   65   94
##                                           
##                Accuracy : 0.7638          
##                  95% CI : (0.7553, 0.7721)
##     No Information Rate : 0.7608          
##     P-Value [Acc > NIR] : 0.2513          
##                                           
##                   Kappa : 0.0457          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99137         
##             Specificity : 0.03971         
##          Pos Pred Value : 0.76658         
##          Neg Pred Value : 0.59119         
##              Prevalence : 0.76084         
##          Detection Rate : 0.75427         
##    Detection Prevalence : 0.98393         
##       Balanced Accuracy : 0.51554         
##                                           
##        'Positive' Class : 0               
## 
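
The individual statistics can also be extracted from the confusionMatrix object programmatically, which is handy for comparing models later (overall and byClass are standard components of caret's confusionMatrix result):

logRegConfMat$overall["Accuracy"]
logRegConfMat$byClass[c("Sensitivity", "Specificity")]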

Specificity is really low, so let's improve the model. You can see which algorithms are available with names(getModelInfo()); we then try the boosted tree model gbm, see http://topepo.github.io/caret/training.html.
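
For instance, a minimal way to search the list of available methods for boosting models (getModelInfo is part of caret):

grep("gbm", names(getModelInfo()), value = TRUE)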

# 10-fold cross-validation, repeated 10 times, to estimate performance
# while tuning the gbm parameters
fitControl <- trainControl(method = 'repeatedcv', number = 10, repeats = 10)
gbmFit1 <- train(ARR_DEL15 ~ ., data = trainData, method = 'gbm', trControl = fitControl, verbose = FALSE)
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.2.5
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Loading required package: plyr
gbmFit1
## Stochastic Gradient Boosting 
## 
## 23097 samples
##     5 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 20786, 20788, 20788, 20786, 20788, 20788, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa      
##   1                   50      0.7608348  0.000000000
##   1                  100      0.7609950  0.001187878
##   1                  150      0.7617180  0.006800028
##   2                   50      0.7639348  0.025748410
##   2                  100      0.7666797  0.050503575
##   2                  150      0.7677405  0.063563826
##   3                   50      0.7661862  0.049266558
##   3                  100      0.7677058  0.066772542
##   3                  150      0.7684808  0.077685240
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were n.trees = 150,
##  interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
plot(gbmFit1)  # resampled accuracy across the tuning grid

plot(gbmFit1, metric = "Kappa")  # the same profile for the Kappa statistic

gbmPrediction <- predict(gbmFit1, testData)
gbmConfMat <- confusionMatrix(gbmPrediction, testData[,"ARR_DEL15"])
gbmConfMat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7449 2197
##          1   81  170
##                                           
##                Accuracy : 0.7698          
##                  95% CI : (0.7614, 0.7781)
##     No Information Rate : 0.7608          
##     P-Value [Acc > NIR] : 0.0182          
##                                           
##                   Kappa : 0.088           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.98924         
##             Specificity : 0.07182         
##          Pos Pred Value : 0.77224         
##          Neg Pred Value : 0.67729         
##              Prevalence : 0.76084         
##          Detection Rate : 0.75265         
##    Detection Prevalence : 0.97464         
##       Balanced Accuracy : 0.53053         
##                                           
##        'Positive' Class : 0               
##
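
Specificity went up from about 0.04 to 0.07, but it is still very low: with roughly 76% of flights on time, the model mostly predicts 'no delay'. Two natural next steps, sketched here with standard caret functions (the exact numbers will depend on the fitted model, and type = "prob" assumes the model supports class probabilities): check which predictors carry the most weight, and predict class probabilities so the default 0.5 decision threshold could be moved.

varImp(gbmFit1)                                        # relative influence of each predictor
gbmProbs <- predict(gbmFit1, testData, type = "prob")  # per-class probabilities instead of labels
head(gbmProbs)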