As a baseline, let us fit a lasso with a multinomial loss (since we have three categories: Cereals, Snacks, and Sweets). We use cross validation to select the tuning parameter.

library(glmnet)
library(varSelRF)
library(plot3D)
library(FSelector)
library(mlbench)
library(knitr)
library(rgl)
set.seed(123)

# dat <- read.csv("C:/Users/Prashan/Dropbox (MIT)/MIT/Predictive Analytics/code/data/d4_650_v4/fold1/train/fold_1_train.csv")
# dat <- read.csv("C:/Users/Prashan/Dropbox (MIT)/MIT/Predictive Analytics/code/data/NASCAR_5f/fold1/train/fold_1_train.csv")
dat <- read.csv("C:/Users/Prashan/Dropbox (MIT)/MIT/Predictive Analytics/code/data/NASCAR_5f/allf_3c_phoenix2_2014_prototype_sel.csv")

# features <- c(15,45,32) #3:45 #c(12,13)  #c(10,30,118,9,28)  3:45 c(4,30,43,7,9,10,25)
features <- 3:45 #c(36,37,38,43,7,9,10,16,20)#c(12,32,45)
complete <- which(rowSums(is.na(dat[, features]))==0)

datc <- dat[complete, ]
x <- as.matrix(datc[, features])
y <- datc[,2]
cl <- factor(y)

nascar_data_with_label=datc[,(-1)]

Variable selection from random forests using OOB error

knit_hooks$set(webgl = hook_webgl)
rf.vs1 <- varSelRF(x, cl, ntree = 200, ntreeIterat = 100,
vars.drop.frac = 0.2)
rf.vs1$selected.vars
## [1] "X128..before.pit...Binned.avg.pre.slope..prev.epoch.avg.rank..new"
## [2] "X26..before.pit..incoming.rank.before.pit.bef.leg"                
## [3] "X35..before.pit..75th.percentile.rank.upto.bef.pit"
plot3d(x[,rf.vs1$selected.vars[1]],x[,rf.vs1$selected.vars[2]],x[,rf.vs1$selected.vars[3]],col=y,xlab=rf.vs1$selected.vars[1],ylab=rf.vs1$selected.vars[2],zlab=rf.vs1$selected.vars[3])
plot(rf.vs1)

unnamed_chunk_2snapshot
You must enable Javascript to view this page properly.

CFS filter

knit_hooks$set(webgl = hook_webgl)
subset <- cfs(rank_change_label~., datc[,(-1)])
plot3d(x[,rf.vs1$selected.vars[1]],x[,rf.vs1$selected.vars[2]],x[,rf.vs1$selected.vars[3]],col=y,xlab=rf.vs1$selected.vars[1],ylab=rf.vs1$selected.vars[2],zlab=rf.vs1$selected.vars[3])
subset
## [1] "X76..after.pit..indicator.2.tire.change..ref.19"             
## [2] "X86..before.pit..indicator.front..lte.8..of.the.pack.pre.pit"
## [3] "X140..after.pit..Time.in.pits.before.outing"                 
## [4] "X160..before.pit..sqrt.of.Binned.avg.prev.slope.HI.RES.133"

unnamed_chunk_3snapshot
You must enable Javascript to view this page properly.

Consistency-based filter

knit_hooks$set(webgl = hook_webgl)
subset <- consistency(rank_change_label~., datc[,(-1)])
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[4])
subset
## [1] "X26..before.pit..incoming.rank.before.pit.bef.leg" 
## [2] "X34..before.pit..25th.percentile.rank.upto.bef.pit"
## [3] "X35..before.pit..75th.percentile.rank.upto.bef.pit"
## [4] "X82..before.pit..starting.position.of.the.car"

unnamed_chunk_4snapshot
You must enable Javascript to view this page properly.

RReliefF filter

knit_hooks$set(webgl = hook_webgl)
weights <- relief(rank_change_label~., datc[,(-1)], neighbours.count = 5, sample.size = 20)
subset <- cutoff.k(weights,5)
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[3])
subset
## [1] "X82..before.pit..starting.position.of.the.car"                                      
## [2] "X56..before.pit..NBHD.prev.epoch..median.ranks.of.neighborhood."                    
## [3] "X77..after.pit..indicator.4.tire.change..ref.19"                                    
## [4] "X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"       
## [5] "X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
#RandomForest filter
#1=mean decrease in accuracy, 2=mean decrease in node impurity
# weights <- random.forest.importance(rank_change_label~., datc[,(-1)], importance.type = 2)
# subset <- cutoff.biggest.diff(weights)
# subset

unnamed_chunk_5snapshot
You must enable Javascript to view this page properly.

Cutoffs

knit_hooks$set(webgl = hook_webgl)
weights <- information.gain(rank_change_label~., nascar_data_with_label)
subset <- cutoff.biggest.diff(weights)
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[3])
subset
## [1] "X26..before.pit..incoming.rank.before.pit.bef.leg"                           
## [2] "X82..before.pit..starting.position.of.the.car"                               
## [3] "X34..before.pit..25th.percentile.rank.upto.bef.pit"                          
## [4] "X35..before.pit..75th.percentile.rank.upto.bef.pit"                          
## [5] "X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"  
## [6] "X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"

unnamed_chunk_6snapshot
You must enable Javascript to view this page properly.

lasso with a multinomial loss

train <- sample(nrow(x), round(nrow(x) * 0.7))
cvfit <- cv.glmnet(x[train, ], y[train], family="multinomial",maxit=1000000)
## Warning: from glmnet Fortran code (error code -93); Convergence for 93th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -69); Convergence for 69th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -83); Convergence for 83th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -76); Convergence for 76th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
plot(cvfit) # let's look at the cross validated error to choose lambda

The curve shows the cross-validated performance of the lasso as a function of the tuning parameter. The second vertical line indicates the value of \(\lambda\) we select based on cross validation. Let us look at the coefficients of the fitted model:

# coef(cvfit, s="lambda.1se")

The coefficient values are intuitive: having a higher level of iron or a lower level of fat are predictive of being a cereal; snacks are pretty similar to the baseline in all respects; and sweets are lower in protein and carbs, but higher in sugar and cholesterol. Finally, observe that water is not a useful feature for distinguishing any of these three classes.

We move on to examine the out-of-sample predictive performance of this model:

yhat <- predict(cvfit, newx=x[-train, ], s="lambda.1se", type="class")
mean(y[-train] != yhat) # classification error
## [1] 0.4693878
table(y[-train], yhat) # confusion matrix
##    yhat
##      1
##   1 26
##   2 10
##   3 13

Every row corresponds to the true label of a food in the test set and every column corresponds to a predicted label. We see that Snacks are the most difficult to classify of the three food groups (which makes intuitive sense).

Let us look at the data a bit. We plot all pairs of features against each other, coloring Cereals black, Snacks red, and Sweets green:

pairs(x, pch="o", col=y, lower.panel=NULL)

We can simplify this picture by plotting the data against just the first two principal components:

pca <- prcomp(x, scale=TRUE)
plot(pca$x, col=y, pch=20)

The first two principal component directions represent the following linear combinations of nutrients:

pca$rotation[,1:2]
##                                                                                                                 PC1
## X9..before.pit..indicator.not.first.leg.of.race                                                         0.018430316
## X13..before.pit..average.prev.rank.2                                                                    0.221332592
## X20..after.pit..race.variant..age.of.LS.tyre.at.pit                                                    -0.036239648
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit                                                    -0.028977008
## X26..before.pit..incoming.rank.before.pit.bef.leg                                                       0.261608658
## X28..before.pit..indicator.pit.in.caution..1.true...indicates.traffic.at.the.start.of.outing            0.014743273
## X34..before.pit..25th.percentile.rank.upto.bef.pit                                                      0.265692188
## X35..before.pit..75th.percentile.rank.upto.bef.pit                                                      0.270147724
## X37..before.pit..prev.min.change.in.rank.over.all.epochs                                               -0.009518722
## X40..before.pit..average.prev..rate.of.change.in.rank                                                  -0.008856162
## X41..before.pit..immediate.prev.rate.of.change.in.rank                                                 -0.010397667
## X43..before.pit..prev.min..change.in.rank.rank.                                                        -0.059090099
## X44..before.pit..average.prev..change.in.rank.rank.                                                    -0.020703471
## X45..before.pit..immediate.prev..change.in.rank.rank.                                                  -0.020311536
## X56..before.pit..NBHD.prev.epoch..median.ranks.of.neighborhood.                                         0.126763883
## X59..before.pit..NBHD.prev.epoch..number.of.0.pit.types.of.neighborhood                                 0.104233006
## X60..before.pit..NBHD.prev.epoch..number.of.2.pit.types.of.neighborhood                                -0.057817178
## X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood                                 0.122481092
## X69..before.pit..NBHD.prev.epoch..Normalized.median.age.of.LS.tyres.at.pit.of.neighborhood..58         -0.030149315
## X75..before.pit..race.variant..cumulative.time.in.pits.upto.pit.entry                                   0.047999273
## X76..after.pit..indicator.2.tire.change..ref.19                                                        -0.128956376
## X77..after.pit..indicator.4.tire.change..ref.19                                                         0.087152532
## X82..before.pit..starting.position.of.the.car                                                           0.256898510
## X86..before.pit..indicator.front..lte.8..of.the.pack.pre.pit                                           -0.019501454
## X89..before.pit..Indicator.2.tire.AND.bottom..gte.23..of.pack.at.pit.entry...features.76...feature.87. -0.128956376
## X128..before.pit...Binned.avg.pre.slope..prev.epoch.avg.rank..new                                       0.245953287
## X133..before.pit..Binned.avg.prev.slope.HI.RES                                                         -0.038580257
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES                                   0.263337252
## X139..after.pit...Binned.avg.pre.slope..pit.exit.rank.HI.RES                                            0.223910509
## X140..after.pit..Time.in.pits.before.outing                                                             0.029455596
## X146..before.pit..norm.immediate.prev.slope...immediate.prev.init.rank.2                                0.219439153
## X149..before.pit..exp.of.norm.immediate.prev.slope                                                     -0.016436347
## X150..before.pit..sqrt.of.immediate.prev.avg.rank                                                       0.155565895
## X151..before.pit..square.of.immediate.prev.avg.rank                                                     0.221218404
## X153..before.pit..sqrt.of.avg.prev.rank.12                                                              0.151454692
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127                              0.246329094
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127                            0.224253521
## X160..before.pit..sqrt.of.Binned.avg.prev.slope.HI.RES.133                                             -0.036123283
## X161..before.pit..square.of.Binned.avg.prev.slope.HI.RES.133                                           -0.042059571
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137                       0.259247148
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137                     0.254349047
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137                        0.044829019
## X166..after.pit.total_tire_age.at.pit                                                                  -0.034230226
##                                                                                                                 PC2
## X9..before.pit..indicator.not.first.leg.of.race                                                         0.307573750
## X13..before.pit..average.prev.rank.2                                                                    0.152308918
## X20..after.pit..race.variant..age.of.LS.tyre.at.pit                                                     0.249663580
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit                                                     0.256620219
## X26..before.pit..incoming.rank.before.pit.bef.leg                                                      -0.049889099
## X28..before.pit..indicator.pit.in.caution..1.true...indicates.traffic.at.the.start.of.outing            0.303553737
## X34..before.pit..25th.percentile.rank.upto.bef.pit                                                     -0.051093057
## X35..before.pit..75th.percentile.rank.upto.bef.pit                                                     -0.023475620
## X37..before.pit..prev.min.change.in.rank.over.all.epochs                                               -0.115275825
## X40..before.pit..average.prev..rate.of.change.in.rank                                                  -0.053712068
## X41..before.pit..immediate.prev.rate.of.change.in.rank                                                 -0.043503991
## X43..before.pit..prev.min..change.in.rank.rank.                                                        -0.111772899
## X44..before.pit..average.prev..change.in.rank.rank.                                                    -0.059416271
## X45..before.pit..immediate.prev..change.in.rank.rank.                                                  -0.042470112
## X56..before.pit..NBHD.prev.epoch..median.ranks.of.neighborhood.                                         0.153646271
## X59..before.pit..NBHD.prev.epoch..number.of.0.pit.types.of.neighborhood                                 0.053709146
## X60..before.pit..NBHD.prev.epoch..number.of.2.pit.types.of.neighborhood                                 0.140802184
## X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood                                 0.169706730
## X69..before.pit..NBHD.prev.epoch..Normalized.median.age.of.LS.tyres.at.pit.of.neighborhood..58          0.177585025
## X75..before.pit..race.variant..cumulative.time.in.pits.upto.pit.entry                                   0.081004189
## X76..after.pit..indicator.2.tire.change..ref.19                                                         0.131493226
## X77..after.pit..indicator.4.tire.change..ref.19                                                        -0.143179553
## X82..before.pit..starting.position.of.the.car                                                          -0.030733130
## X86..before.pit..indicator.front..lte.8..of.the.pack.pre.pit                                           -0.306261375
## X89..before.pit..Indicator.2.tire.AND.bottom..gte.23..of.pack.at.pit.entry...features.76...feature.87.  0.131493226
## X128..before.pit...Binned.avg.pre.slope..prev.epoch.avg.rank..new                                      -0.097494490
## X133..before.pit..Binned.avg.prev.slope.HI.RES                                                         -0.049409976
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES                                  -0.060821170
## X139..after.pit...Binned.avg.pre.slope..pit.exit.rank.HI.RES                                           -0.025175945
## X140..after.pit..Time.in.pits.before.outing                                                            -0.007804419
## X146..before.pit..norm.immediate.prev.slope...immediate.prev.init.rank.2                                0.140382133
## X149..before.pit..exp.of.norm.immediate.prev.slope                                                      0.248065980
## X150..before.pit..sqrt.of.immediate.prev.avg.rank                                                       0.251164707
## X151..before.pit..square.of.immediate.prev.avg.rank                                                     0.148980848
## X153..before.pit..sqrt.of.avg.prev.rank.12                                                              0.256047858
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127                             -0.085861648
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127                           -0.114831317
## X160..before.pit..sqrt.of.Binned.avg.prev.slope.HI.RES.133                                             -0.047730373
## X161..before.pit..square.of.Binned.avg.prev.slope.HI.RES.133                                           -0.051932129
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137                      -0.055727867
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137                    -0.065031516
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137                       -0.081407303
## X166..after.pit.total_tire_age.at.pit                                                                   0.263380139

What is going on with that tight cluster of cereals (black points) with a high value of the first PC?

cereal <- y == levels(y)[1]
ii <- cereal & (pca$x[, 1]>3)
datc$Shrt_Desc[ii]
## NULL

Notice these cereals all have “W/ H20” mentioned. That is consistent with the fact that PC1 has a large positive value for the Water feature.

And what about that odd-ball cereal in the lower left? That’s .

This doesn’t ring any bells, so let’s look at its composition:

x[cereal & (pca$x[, 2] < -4), ]
##      X9..before.pit..indicator.not.first.leg.of.race
##      X13..before.pit..average.prev.rank.2
##      X20..after.pit..race.variant..age.of.LS.tyre.at.pit
##      X21..after.pit..race.variant..age.of.RS.tyre.at.pit
##      X26..before.pit..incoming.rank.before.pit.bef.leg
##      X28..before.pit..indicator.pit.in.caution..1.true...indicates.traffic.at.the.start.of.outing
##      X34..before.pit..25th.percentile.rank.upto.bef.pit
##      X35..before.pit..75th.percentile.rank.upto.bef.pit
##      X37..before.pit..prev.min.change.in.rank.over.all.epochs
##      X40..before.pit..average.prev..rate.of.change.in.rank
##      X41..before.pit..immediate.prev.rate.of.change.in.rank
##      X43..before.pit..prev.min..change.in.rank.rank.
##      X44..before.pit..average.prev..change.in.rank.rank.
##      X45..before.pit..immediate.prev..change.in.rank.rank.
##      X56..before.pit..NBHD.prev.epoch..median.ranks.of.neighborhood.
##      X59..before.pit..NBHD.prev.epoch..number.of.0.pit.types.of.neighborhood
##      X60..before.pit..NBHD.prev.epoch..number.of.2.pit.types.of.neighborhood
##      X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood
##      X69..before.pit..NBHD.prev.epoch..Normalized.median.age.of.LS.tyres.at.pit.of.neighborhood..58
##      X75..before.pit..race.variant..cumulative.time.in.pits.upto.pit.entry
##      X76..after.pit..indicator.2.tire.change..ref.19
##      X77..after.pit..indicator.4.tire.change..ref.19
##      X82..before.pit..starting.position.of.the.car
##      X86..before.pit..indicator.front..lte.8..of.the.pack.pre.pit
##      X89..before.pit..Indicator.2.tire.AND.bottom..gte.23..of.pack.at.pit.entry...features.76...feature.87.
##      X128..before.pit...Binned.avg.pre.slope..prev.epoch.avg.rank..new
##      X133..before.pit..Binned.avg.prev.slope.HI.RES
##      X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES
##      X139..after.pit...Binned.avg.pre.slope..pit.exit.rank.HI.RES
##      X140..after.pit..Time.in.pits.before.outing
##      X146..before.pit..norm.immediate.prev.slope...immediate.prev.init.rank.2
##      X149..before.pit..exp.of.norm.immediate.prev.slope
##      X150..before.pit..sqrt.of.immediate.prev.avg.rank
##      X151..before.pit..square.of.immediate.prev.avg.rank
##      X153..before.pit..sqrt.of.avg.prev.rank.12
##      X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127
##      X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127
##      X160..before.pit..sqrt.of.Binned.avg.prev.slope.HI.RES.133
##      X161..before.pit..square.of.Binned.avg.prev.slope.HI.RES.133
##      X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137
##      X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137
##      X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137
##      X166..after.pit.total_tire_age.at.pit

That calcium value is quite high, and also 3333 looks like it could be coding a missing value or something (since I think 999 is sometimes used for this), however I can’t find any references to 3333 in the documentation for the data. And plotting the Calcium values, it doesn’t look that crazy, especially after a log transformation:

# boxplot(x[cereal,"Calcium_.mg."], main="Calcium values")
# hist(log(1+x[cereal,"Calcium_.mg."]), main="log(Calcium values)")