library(glmnet)
library(varSelRF)
library(FSelector)
library(mlbench)
library(plot3D)
library(knitr)
library(rgl)
set.seed(2)
knit_hooks$set(webgl = hook_webgl)

We chose to have 3 categories.
Gain in rank -> Black – The car improves rank
No Change in rank -> Red
A Loss in rank -> Green – The racer falls behind
Load the data

dat <- read.csv("C:/Users/Prashan/Dropbox (MIT)/MIT/Predictive Analytics/code/data/NASCAR_5f/allf_3c_phoenix2_2014_prototype_sel.csv")

Start with 43 features, create the x-attributes, y-label matrices

features <- 3:45 
complete <- which(rowSums(is.na(dat[, features]))==0)
datc <- dat[complete, ]
x <- as.matrix(datc[, features])
y <- datc[,2]
cl <- factor(y)

#column one contains the names of the data points (i.e cereal-captain-crunch..)
#take out the data point names and leave only the label and attributes
nascar_data_with_label=datc[,(-1)]
selected_features<-c()
  1. Variable selection from random forests using OOB error (https://cran.r-project.org/web/packages/varSelRF/varSelRF.pdf)
rf.vs1 <- varSelRF(x, cl, ntree = 200, ntreeIterat = 100,
vars.drop.frac = 0.2)

selected variables

rf.vs1$selected.vars
## [1] "X128..before.pit...Binned.avg.pre.slope..prev.epoch.avg.rank..new"               
## [2] "X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
## [3] "X26..before.pit..incoming.rank.before.pit.bef.leg"                               
## [4] "X35..before.pit..75th.percentile.rank.upto.bef.pit"
  if (length(rf.vs1$selected.vars)==3){
        plot3d(x[,rf.vs1$selected.vars[1]],x[,rf.vs1$selected.vars[2]],x[,rf.vs1$selected.vars[3]],col=y,xlab=rf.vs1$selected.vars[1],ylab=rf.vs1$selected.vars[2],zlab=rf.vs1$selected.vars[3])
  }else if (length(rf.vs1$selected.vars)==2){
        plot(x[,rf.vs1$selected.vars[1]],x[,rf.vs1$selected.vars[2]],col=y,xlab=rf.vs1$selected.vars[1],ylab=rf.vs1$selected.vars[2])
  }
# plot(rf.vs1)
  1. Variable selection from CFS filter (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)
subset <- cfs(rank_change_label~., datc[,(-1)])

selected variables

subset
## [1] "X76..after.pit..indicator.2.tire.change..ref.19"                                                       
## [2] "X86..before.pit..indicator.front..lte.8..of.the.pack.pre.pit"                                          
## [3] "X89..before.pit..Indicator.2.tire.AND.bottom..gte.23..of.pack.at.pit.entry...features.76...feature.87."
## [4] "X140..after.pit..Time.in.pits.before.outing"                                                           
## [5] "X160..before.pit..sqrt.of.Binned.avg.prev.slope.HI.RES.133"
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[3])

unnamed_chunk_6snapshot
You must enable Javascript to view this page properly.

  1. Variable selection from Consistency-based filter (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)
subset <- consistency(rank_change_label~., datc[,(-1)])

selected variables

subset
## [1] "X26..before.pit..incoming.rank.before.pit.bef.leg" 
## [2] "X34..before.pit..25th.percentile.rank.upto.bef.pit"
## [3] "X35..before.pit..75th.percentile.rank.upto.bef.pit"
## [4] "X82..before.pit..starting.position.of.the.car"
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[3])
selected_features<-c(selected_features,subset)

unnamed_chunk_8snapshot
You must enable Javascript to view this page properly.

  1. Variable selection from RReliefF filter (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)
weights <- relief(rank_change_label~., datc[,(-1)], neighbours.count = 5, sample.size = 20)
subset <- cutoff.k(weights,5)

selected variables

subset
## [1] "X26..before.pit..incoming.rank.before.pit.bef.leg"  
## [2] "X166..after.pit.total_tire_age.at.pit"              
## [3] "X20..after.pit..race.variant..age.of.LS.tyre.at.pit"
## [4] "X21..after.pit..race.variant..age.of.RS.tyre.at.pit"
## [5] "X34..before.pit..25th.percentile.rank.upto.bef.pit"
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[3])
selected_features<-c(selected_features,subset)

unnamed_chunk_10snapshot
You must enable Javascript to view this page properly.

  1. Variable selection from Information Gain (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)
weights <- information.gain(rank_change_label~., nascar_data_with_label)
subset <- cutoff.biggest.diff(weights)

selected variables

subset
##  [1] "X26..before.pit..incoming.rank.before.pit.bef.leg"                                  
##  [2] "X82..before.pit..starting.position.of.the.car"                                      
##  [3] "X34..before.pit..25th.percentile.rank.upto.bef.pit"                                 
##  [4] "X35..before.pit..75th.percentile.rank.upto.bef.pit"                                 
##  [5] "X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES"              
##  [6] "X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"         
##  [7] "X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"       
##  [8] "X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"  
##  [9] "X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
## [10] "X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[3])
selected_features<-c(selected_features,subset)

unnamed_chunk_12snapshot
You must enable Javascript to view this page properly.

After 5 feature selection algorithms we list down the subset of all the features that were selected

unique_f<-unique(selected_features)
unique_f
##  [1] "X26..before.pit..incoming.rank.before.pit.bef.leg"                                  
##  [2] "X34..before.pit..25th.percentile.rank.upto.bef.pit"                                 
##  [3] "X35..before.pit..75th.percentile.rank.upto.bef.pit"                                 
##  [4] "X82..before.pit..starting.position.of.the.car"                                      
##  [5] "X166..after.pit.total_tire_age.at.pit"                                              
##  [6] "X20..after.pit..race.variant..age.of.LS.tyre.at.pit"                                
##  [7] "X21..after.pit..race.variant..age.of.RS.tyre.at.pit"                                
##  [8] "X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES"              
##  [9] "X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"         
## [10] "X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"       
## [11] "X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"  
## [12] "X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
## [13] "X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
subset_x=x[,unique_f]

lasso with a multinomial loss. Here we train on the full set of features and see that even with maxit=1000000 there is no convergence.

train <- sample(nrow(x[]), round(nrow(x) * 0.7))
cvfit <- cv.glmnet(x[train, ], y[train], family="multinomial",maxit=1000000)
## Warning: from glmnet Fortran code (error code -70); Convergence for 70th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -70); Convergence for 70th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
plot(cvfit) # let's look at the cross validated error to choose lambda

unnamed_chunk_14snapshot
You must enable Javascript to view this page properly.

We train on the data with a subset of features

train <- sample(nrow(subset_x), round(nrow(subset_x) * 0.7))
cvfit <- cv.glmnet(subset_x[train, ], y[train], family="multinomial",maxit=1000000)
plot(cvfit) # let's look at the cross validated error to choose lambda

unnamed_chunk_15snapshot
You must enable Javascript to view this page properly.

coef(cvfit, s="lambda.1se")
## $`1`
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                                                                                             1
## (Intercept)                                                                         0.5654831
## X26..before.pit..incoming.rank.before.pit.bef.leg                                   .        
## X34..before.pit..25th.percentile.rank.upto.bef.pit                                  .        
## X35..before.pit..75th.percentile.rank.upto.bef.pit                                  .        
## X82..before.pit..starting.position.of.the.car                                       .        
## X166..after.pit.total_tire_age.at.pit                                               .        
## X20..after.pit..race.variant..age.of.LS.tyre.at.pit                                 .        
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit                                 .        
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES               .        
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127          .        
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127        .        
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137   .        
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 .        
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137    .        
## 
## $`2`
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                                                                                              1
## (Intercept)                                                                         -0.5331292
## X26..before.pit..incoming.rank.before.pit.bef.leg                                    .        
## X34..before.pit..25th.percentile.rank.upto.bef.pit                                   .        
## X35..before.pit..75th.percentile.rank.upto.bef.pit                                   .        
## X82..before.pit..starting.position.of.the.car                                        .        
## X166..after.pit.total_tire_age.at.pit                                                .        
## X20..after.pit..race.variant..age.of.LS.tyre.at.pit                                  .        
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit                                  .        
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES                .        
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127           .        
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127         .        
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137    .        
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137  .        
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137     .        
## 
## $`3`
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                                                                                              1
## (Intercept)                                                                         -0.0323539
## X26..before.pit..incoming.rank.before.pit.bef.leg                                    .        
## X34..before.pit..25th.percentile.rank.upto.bef.pit                                   .        
## X35..before.pit..75th.percentile.rank.upto.bef.pit                                   .        
## X82..before.pit..starting.position.of.the.car                                        .        
## X166..after.pit.total_tire_age.at.pit                                                .        
## X20..after.pit..race.variant..age.of.LS.tyre.at.pit                                  .        
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit                                  .        
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES                .        
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127           .        
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127         .        
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137    .        
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137  .        
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137     .
yhat <- predict(cvfit, newx=subset_x[-train, ], s="lambda.1se", type="class")
mean(y[-train] != yhat) # classification error
## [1] 0.6041667
table(y[-train], yhat) # confusion matrix
##    yhat
##      1
##   1 19
##   2 10
##   3 19

We look at the pairwise plots of the final subset of features selected by the 5 feature selectio algorithms.

pairs(subset_x, pch="o", col=y, lower.panel=NULL)

We do PCA Using only the selected subset of features

pca <- prcomp(subset_x, scale=TRUE)
plot(pca$x, col=y, pch=20)

The first two principal component directions represent the following linear combinations of sub selected features:

pca$rotation[,1:2]
##                                                                                             PC1
## X26..before.pit..incoming.rank.before.pit.bef.leg                                    0.32291282
## X34..before.pit..25th.percentile.rank.upto.bef.pit                                   0.32809403
## X35..before.pit..75th.percentile.rank.upto.bef.pit                                   0.32522720
## X82..before.pit..starting.position.of.the.car                                        0.31280266
## X166..after.pit.total_tire_age.at.pit                                               -0.12616757
## X20..after.pit..race.variant..age.of.LS.tyre.at.pit                                 -0.12409582
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit                                 -0.11763566
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES                0.33493064
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127           0.32632219
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127         0.31194482
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137    0.32906598
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137  0.32536051
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137     0.09585955
##                                                                                              PC2
## X26..before.pit..incoming.rank.before.pit.bef.leg                                   -0.069890375
## X34..before.pit..25th.percentile.rank.upto.bef.pit                                  -0.082235700
## X35..before.pit..75th.percentile.rank.upto.bef.pit                                  -0.116791434
## X82..before.pit..starting.position.of.the.car                                       -0.106712884
## X166..after.pit.total_tire_age.at.pit                                               -0.573943924
## X20..after.pit..race.variant..age.of.LS.tyre.at.pit                                 -0.549735917
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit                                 -0.552191298
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES               -0.085562335
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127          -0.036123278
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127         0.009582623
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137   -0.085313061
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 -0.078796422
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137     0.067701442