library(glmnet)     # lasso / elastic-net (multinomial lasso below)
library(varSelRF)   # variable selection via random forests
library(plot3D)
library(FSelector)  # CFS, consistency, relief, information-gain filters
library(mlbench)
library(knitr)
library(rgl)        # interactive 3D scatter plots (plot3d) + webgl hook
library(R.basic)

I chose three categories for the rank-change label:
Rank Gain -> black (the car improves its rank)
No Change -> red
Rank Loss -> green (the car falls behind)
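With R's classic default palette, an integer passed as col maps 1 to black, 2 to red, and 3 to green, so col = y in the plots below reproduces this key. A minimal sketch making the mapping explicit (assumes y is coded 1/2/3; note that R >= 4.0 changed the default palette colors slightly):

# Explicit version of the color key used throughout (assumes y in {1, 2, 3})
class_cols <- c("black", "red", "green3")
# e.g. plot(..., col = class_cols[y]) instead of relying on palette()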
Load the data

set.seed(123)
dat <- read.csv("C:/Users/Prashan/Dropbox (MIT)/MIT/Predictive Analytics/code/data/NASCAR_5f/allf_3c_phoenix2_2014_prototype_sel.csv")

We start with 43 candidate features (data columns 3 through 45).

features <- 3:45  # columns 3..45 hold the 43 candidate features
complete <- which(rowSums(is.na(dat[, features])) == 0)  # rows with no missing feature values

datc <- dat[complete, ]
x <- as.matrix(datc[, features])  # feature matrix
y <- datc[, 2]                    # rank-change label, coded 1/2/3 (see key above)
cl <- factor(y)

nascar_data_with_label <- datc[, -1]  # drop the first column; keep the label for formula interfaces
selected_features <- c()              # accumulator for the filter results below
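A quick sanity check on what we are working with, using only the objects defined above:

# class balance of the complete cases, and the size of the feature matrix
table(cl)
dim(x)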

Variable selection from random forests using OOB error

knit_hooks$set(webgl = hook_webgl)
# backward elimination over random forests, dropping 20% of variables per step
rf.vs1 <- varSelRF(x, cl, ntree = 200, ntreeIterat = 100,
                   vars.drop.frac = 0.2)
rf.vs1$selected.vars
## [1] "X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
## [2] "X34..before.pit..25th.percentile.rank.upto.bef.pit"
# two variables survived the selection here, so the 2D branch runs
if (length(rf.vs1$selected.vars) == 3) {
  plot3d(x[, rf.vs1$selected.vars[1]], x[, rf.vs1$selected.vars[2]],
         x[, rf.vs1$selected.vars[3]], col = y,
         xlab = rf.vs1$selected.vars[1], ylab = rf.vs1$selected.vars[2],
         zlab = rf.vs1$selected.vars[3])
} else if (length(rf.vs1$selected.vars) == 2) {
  plot(x[, rf.vs1$selected.vars[1]], x[, rf.vs1$selected.vars[2]], col = y,
       xlab = rf.vs1$selected.vars[1], ylab = rf.vs1$selected.vars[2])
}

# plot(rf.vs1)
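varSelRF repeatedly drops the least important fraction of variables and keeps the smallest forest whose OOB error is (by default) within one standard error of the minimum. To inspect the path it took, the fitted object records each step; a minimal sketch (the selec.history field and its column names are per the varSelRF documentation):

# backward-elimination path: variables retained vs. OOB error at each step
rf.vs1$selec.history
# with(rf.vs1$selec.history,
#      plot(Number.Variables, OOB, type = "b",
#           xlab = "variables retained", ylab = "OOB error"))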

Variable selection with the CFS (correlation-based feature selection) filter (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)

knit_hooks$set(webgl = hook_webgl)
subset <- cfs(rank_change_label ~ ., datc[, -1])
# label the axes with the CFS-selected features (the original code mistakenly
# reused the varSelRF variable names here)
plot3d(x[, subset[1]], x[, subset[2]], x[, subset[3]], col = y,
       xlab = subset[1], ylab = subset[2], zlab = subset[3])

subset
## [1] "X76..after.pit..indicator.2.tire.change..ref.19"                                                       
## [2] "X86..before.pit..indicator.front..lte.8..of.the.pack.pre.pit"                                          
## [3] "X89..before.pit..Indicator.2.tire.AND.bottom..gte.23..of.pack.at.pit.entry...features.76...feature.87."
## [4] "X140..after.pit..Time.in.pits.before.outing"                                                           
## [5] "X160..before.pit..sqrt.of.Binned.avg.prev.slope.HI.RES.133"

Variable selection with the consistency-based filter (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)

knit_hooks$set(webgl = hook_webgl)
subset <- consistency(rank_change_label ~ ., datc[, -1])
# note: the z axis plots subset[3], so zlab should be subset[3] (the original
# code used subset[4])
plot3d(x[, subset[1]], x[, subset[2]], x[, subset[3]], col = y,
       xlab = subset[1], ylab = subset[2], zlab = subset[3])
## Warning in persp.default(x = xdummy, y = ydummy, z = zdummy, xlim = xlim, :
## surface extends beyond the box

subset
## [1] "X26..before.pit..incoming.rank.before.pit.bef.leg" 
## [2] "X34..before.pit..25th.percentile.rank.upto.bef.pit"
## [3] "X35..before.pit..75th.percentile.rank.upto.bef.pit"
## [4] "X82..before.pit..starting.position.of.the.car"
selected_features<-c(selected_features,subset)
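The consistency filter looks for a small subset whose inconsistency rate is as low as the full set's: rows that share a feature pattern but disagree on the class count against the subset. A hedged sketch of that rate, not FSelector's implementation (assumes discrete features; FSelector handles discretization internally, and the function name is my own):

# inconsistency rate of a feature subset: for each distinct feature pattern,
# count every row outside that pattern's majority class
inconsistency_rate <- function(df, features, label) {
  key <- interaction(df[, features, drop = FALSE], drop = TRUE)
  mismatch <- tapply(df[[label]], key,
                     function(cl) length(cl) - max(table(cl)))
  sum(mismatch) / nrow(df)
}
# e.g. inconsistency_rate(datc, subset, "rank_change_label")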

Variable selection with the RReliefF filter (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)

knit_hooks$set(webgl = hook_webgl)
weights <- relief(rank_change_label ~ ., datc[, -1],
                  neighbours.count = 5, sample.size = 20)
subset <- cutoff.k(weights, 5)  # keep the 5 highest-weighted features
plot3d(x[, subset[1]], x[, subset[2]], x[, subset[3]], col = y,
       xlab = subset[1], ylab = subset[2], zlab = subset[3])

subset
## [1] "X77..after.pit..indicator.4.tire.change..ref.19"                        
## [2] "X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood"
## [3] "X56..before.pit..NBHD.prev.epoch..median.ranks.of.neighborhood."        
## [4] "X140..after.pit..Time.in.pits.before.outing"                            
## [5] "X60..before.pit..NBHD.prev.epoch..number.of.2.pit.types.of.neighborhood"
selected_features<-c(selected_features,subset)
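For intuition: relief-family weights grow when a feature differs on a nearest miss (nearest neighbour from another class) and shrink when it differs on a nearest hit (same class). A one-feature sketch of the update, not FSelector's implementation (m plays the role of sample.size; values assumed range-normalized):

# one relief-style weight update for a single feature (illustrative only):
# xi, hit, miss are that feature's values for the sampled instance, its
# nearest hit, and its nearest miss
relief_update <- function(w, xi, hit, miss, m) {
  w - abs(xi - hit) / m + abs(xi - miss) / m
}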
#RandomForest filter
#1=mean decrease in accuracy, 2=mean decrease in node impurity
# weights <- random.forest.importance(rank_change_label~., datc[,(-1)], importance.type = 2)
# subset <- cutoff.biggest.diff(weights)
# subset

Variable selection with the information-gain filter (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)

knit_hooks$set(webgl = hook_webgl)
weights <- information.gain(rank_change_label ~ ., nascar_data_with_label)
subset <- cutoff.biggest.diff(weights)  # cut where the weight gap is largest
plot3d(x[, subset[1]], x[, subset[2]], x[, subset[3]], col = y,
       xlab = subset[1], ylab = subset[2], zlab = subset[3])
## Warning in persp.default(x = xdummy, y = ydummy, z = zdummy, xlim = xlim, :
## surface extends beyond the box

subset
##  [1] "X26..before.pit..incoming.rank.before.pit.bef.leg"                                  
##  [2] "X82..before.pit..starting.position.of.the.car"                                      
##  [3] "X34..before.pit..25th.percentile.rank.upto.bef.pit"                                 
##  [4] "X35..before.pit..75th.percentile.rank.upto.bef.pit"                                 
##  [5] "X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES"              
##  [6] "X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"         
##  [7] "X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"       
##  [8] "X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"  
##  [9] "X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
## [10] "X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
selected_features<-c(selected_features,subset)
unique_f<-unique(selected_features)
unique_f
##  [1] "X26..before.pit..incoming.rank.before.pit.bef.leg"                                  
##  [2] "X34..before.pit..25th.percentile.rank.upto.bef.pit"                                 
##  [3] "X35..before.pit..75th.percentile.rank.upto.bef.pit"                                 
##  [4] "X82..before.pit..starting.position.of.the.car"                                      
##  [5] "X77..after.pit..indicator.4.tire.change..ref.19"                                    
##  [6] "X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood"            
##  [7] "X56..before.pit..NBHD.prev.epoch..median.ranks.of.neighborhood."                    
##  [8] "X140..after.pit..Time.in.pits.before.outing"                                        
##  [9] "X60..before.pit..NBHD.prev.epoch..number.of.2.pit.types.of.neighborhood"            
## [10] "X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES"              
## [11] "X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"         
## [12] "X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"       
## [13] "X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"  
## [14] "X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
## [15] "X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
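Information gain is the drop in class entropy from conditioning on a (discretized) feature: IG(f) = H(class) - H(class | f). A self-contained sketch for a discrete feature, not FSelector's code (FSelector discretizes numeric columns first; this skips that step):

# entropy-based information gain for one discrete feature (illustrative)
entropy <- function(p) { p <- p[p > 0]; -sum(p * log2(p)) }
info_gain <- function(f, cl) {
  h_class <- entropy(prop.table(table(cl)))
  h_cond  <- sum(prop.table(table(f)) *
                 tapply(cl, f, function(g) entropy(prop.table(table(g)))))
  h_class - h_cond
}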

Lasso with a multinomial loss. Even with maxit = 1000000, glmnet fails to converge for several values of lambda.

train <- sample(nrow(x), round(nrow(x) * 0.7))  # 70/30 train/test split
cvfit <- cv.glmnet(x[train, ], y[train], family = "multinomial", maxit = 1000000)
## Warning: from glmnet Fortran code (error code -72); Convergence for 72th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -76); Convergence for 76th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -91); Convergence for 91th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -79); Convergence for 79th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -77); Convergence for 77th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
plot(cvfit)  # look at the cross-validated error to choose lambda

# coef(cvfit, s="lambda.1se")
yhat <- predict(cvfit, newx=x[-train, ], s="lambda.1se", type="class")
mean(y[-train] != yhat) # classification error
## [1] 0.5833333
table(y[-train], yhat) # confusion matrix
##    yhat
##      1
##   1 20
##   2 11
##   3 17

At lambda.1se the penalty is strong enough that the fit predicts class 1 for every test observation, so the 58% error rate simply reflects the share of non-class-1 cases in the test set.
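If convergence mattered more here, one option is to loosen glmnet's convergence tolerance (the thresh argument, default 1e-7) at the cost of a less exact solution; a hedged sketch, not run above:

# cvfit2 <- cv.glmnet(x[train, ], y[train], family = "multinomial",
#                     maxit = 1e6, thresh = 1e-5)  # looser tolerance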

We look at the pairwise plots of the features selected by the various algorithms.

# pairs(x, pch="o", col=y, lower.panel=NULL)
pairs(x[, unique_f], pch = "o", col = y, lower.panel = NULL)

We can simplify this picture by plotting the data against just the first two principal components:

# pca <- prcomp(x, scale=TRUE)
# plot(pca$x, col=y, pch=20)

Using only the unique set of selected features:

pca <- prcomp(x[, unique_f], scale = TRUE)  # PCA on the selected features only
plot(pca$x, col = y, pch = 20)

The first two principal component directions represent the following linear combinations of the selected features:

pca$rotation[,1:2]
##                                                                                             PC1
## X26..before.pit..incoming.rank.before.pit.bef.leg                                    0.32375470
## X34..before.pit..25th.percentile.rank.upto.bef.pit                                   0.32943558
## X35..before.pit..75th.percentile.rank.upto.bef.pit                                   0.32871785
## X82..before.pit..starting.position.of.the.car                                        0.31609669
## X77..after.pit..indicator.4.tire.change..ref.19                                      0.12114162
## X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood              0.10706984
## X56..before.pit..NBHD.prev.epoch..median.ranks.of.neighborhood.                      0.11230882
## X140..after.pit..Time.in.pits.before.outing                                          0.08815530
## X60..before.pit..NBHD.prev.epoch..number.of.2.pit.types.of.neighborhood             -0.08889812
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES                0.33384930
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127           0.32024680
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127         0.30094928
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137    0.32826242
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137  0.32329384
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137     0.08600491
##                                                                                               PC2
## X26..before.pit..incoming.rank.before.pit.bef.leg                                   -0.0201284062
## X34..before.pit..25th.percentile.rank.upto.bef.pit                                  -0.0530322125
## X35..before.pit..75th.percentile.rank.upto.bef.pit                                  -0.0124555595
## X82..before.pit..starting.position.of.the.car                                       -0.0181138726
## X77..after.pit..indicator.4.tire.change..ref.19                                      0.5855772347
## X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood             -0.4787802098
## X56..before.pit..NBHD.prev.epoch..median.ranks.of.neighborhood.                      0.0030155914
## X140..after.pit..Time.in.pits.before.outing                                          0.5744125430
## X60..before.pit..NBHD.prev.epoch..number.of.2.pit.types.of.neighborhood              0.2885625126
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES               -0.0151548930
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127          -0.0076419610
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127        -0.0009078096
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137   -0.0107298101
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 -0.0260680148
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137     0.0992836321
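As a quick check on how much of the variance the two-dimensional projection retains, the standard prcomp summary gives the proportion of variance per component (a follow-up sketch on the pca object fitted above):

summary(pca)$importance[, 1:2]  # sd, proportion, cumulative for PC1-PC2
# screeplot(pca, type = "lines")  # same information, graphically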