library(glmnet)
library(varSelRF)
library(FSelector)
library(mlbench)
library(plot3D)
library(knitr)
library(rgl)
knit_hooks$set(webgl = hook_webgl)
We chose to have 3 categories.
A Gain in rank -> Black – Improves rank
No Change in rank -> Red
A Loss in rank -> Green – Falls behind
Load the data
dat <- read.csv("C:/Users/Prashan/Dropbox (MIT)/MIT/Predictive Analytics/code/data/NASCAR_5f/allf_3c_phoenix2_2014_prototype_sel.csv")
Start with 43 features, create the x-attributes, y-label matrices
features <- 3:45
complete <- which(rowSums(is.na(dat[, features]))==0)
datc <- dat[complete, ]
x <- as.matrix(datc[, features])
y <- datc[,2]
cl <- factor(y)
#column one contains the names of the data points (i.e cereal-captian-crunch..)
#take out the data point names and leave only the label and attributes
nascar_data_with_label=datc[,(-1)]
selected_features<-c()
Variable selection from random forests using OOB error
knit_hooks$set(webgl = hook_webgl)
rf.vs1 <- varSelRF(x, cl, ntree = 200, ntreeIterat = 100,
vars.drop.frac = 0.2)
selected variables
rf.vs1$selected.vars
## [1] "X140..after.pit..Time.in.pits.before.outing"
## [2] "X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood"
if (length(rf.vs1$selected.vars)==3){
plot3d(x[,rf.vs1$selected.vars[1]],x[,rf.vs1$selected.vars[2]],x[,rf.vs1$selected.vars[3]],col=y,xlab=rf.vs1$selected.vars[1],ylab=rf.vs1$selected.vars[2],zlab=rf.vs1$selected.vars[3])
}else if (length(rf.vs1$selected.vars)==2){
plot(x[,rf.vs1$selected.vars[1]],x[,rf.vs1$selected.vars[2]],col=y,xlab=rf.vs1$selected.vars[1],ylab=rf.vs1$selected.vars[2])
}
# plot(rf.vs1)
Variable selection from CFS filter (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)
knit_hooks$set(webgl = hook_webgl)
subset <- cfs(rank_change_label~., datc[,(-1)])
selected variables
subset
## [1] "X76..after.pit..indicator.2.tire.change..ref.19"
## [2] "X86..before.pit..indicator.front..lte.8..of.the.pack.pre.pit"
## [3] "X89..before.pit..Indicator.2.tire.AND.bottom..gte.23..of.pack.at.pit.entry...features.76...feature.87."
## [4] "X140..after.pit..Time.in.pits.before.outing"
## [5] "X160..before.pit..sqrt.of.Binned.avg.prev.slope.HI.RES.133"
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[3])
You must enable Javascript to view this page properly.
Variable selection from Consistency-based filter (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)
knit_hooks$set(webgl = hook_webgl)
subset <- consistency(rank_change_label~., datc[,(-1)])
You must enable Javascript to view this page properly.
selected variables
subset
## [1] "X26..before.pit..incoming.rank.before.pit.bef.leg"
## [2] "X34..before.pit..25th.percentile.rank.upto.bef.pit"
## [3] "X35..before.pit..75th.percentile.rank.upto.bef.pit"
## [4] "X82..before.pit..starting.position.of.the.car"
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[3])
selected_features<-c(selected_features,subset)
You must enable Javascript to view this page properly.
Variable selection from RReliefF filter (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)
knit_hooks$set(webgl = hook_webgl)
weights <- relief(rank_change_label~., datc[,(-1)], neighbours.count = 5, sample.size = 20)
subset <- cutoff.k(weights,5)
You must enable Javascript to view this page properly.
selected variables
subset
## [1] "X140..after.pit..Time.in.pits.before.outing"
## [2] "X21..after.pit..race.variant..age.of.RS.tyre.at.pit"
## [3] "X41..before.pit..immediate.prev.rate.of.change.in.rank"
## [4] "X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood"
## [5] "X37..before.pit..prev.min.change.in.rank.over.all.epochs"
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[3])
selected_features<-c(selected_features,subset)
#RandomForest filter
#1=mean decrease in accuracy, 2=mean decrease in node impurity
# weights <- random.forest.importance(rank_change_label~., datc[,(-1)], importance.type = 2)
# subset <- cutoff.biggest.diff(weights)
# subset
You must enable Javascript to view this page properly.
Variable selection from Information Gain (https://cran.r-project.org/web/packages/FSelector/FSelector.pdf)
knit_hooks$set(webgl = hook_webgl)
weights <- information.gain(rank_change_label~., nascar_data_with_label)
subset <- cutoff.biggest.diff(weights)
You must enable Javascript to view this page properly.
selected variables
subset
## [1] "X26..before.pit..incoming.rank.before.pit.bef.leg"
## [2] "X82..before.pit..starting.position.of.the.car"
## [3] "X34..before.pit..25th.percentile.rank.upto.bef.pit"
## [4] "X35..before.pit..75th.percentile.rank.upto.bef.pit"
## [5] "X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES"
## [6] "X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"
## [7] "X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"
## [8] "X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
## [9] "X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
## [10] "X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
plot3d(x[,subset[1]],x[,subset[2]],x[,subset[3]],col=y,xlab=subset[1],ylab=subset[2],zlab=subset[3])
selected_features<-c(selected_features,subset)
unique_f<-unique(selected_features)
unique_f
## [1] "X26..before.pit..incoming.rank.before.pit.bef.leg"
## [2] "X34..before.pit..25th.percentile.rank.upto.bef.pit"
## [3] "X35..before.pit..75th.percentile.rank.upto.bef.pit"
## [4] "X82..before.pit..starting.position.of.the.car"
## [5] "X140..after.pit..Time.in.pits.before.outing"
## [6] "X21..after.pit..race.variant..age.of.RS.tyre.at.pit"
## [7] "X41..before.pit..immediate.prev.rate.of.change.in.rank"
## [8] "X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood"
## [9] "X37..before.pit..prev.min.change.in.rank.over.all.epochs"
## [10] "X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES"
## [11] "X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"
## [12] "X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127"
## [13] "X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
## [14] "X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
## [15] "X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137"
subset_x=x[,unique_f]
You must enable Javascript to view this page properly.
lasso with a multinomial loss. Here we train on the full set of features and see that even with maxit=1000000 there is no convergence.
train <- sample(nrow(x[]), round(nrow(x) * 0.7))
cvfit <- cv.glmnet(x[train, ], y[train], family="multinomial",maxit=1000000)
## Warning: from glmnet Fortran code (error code -72); Convergence for 72th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -96); Convergence for 96th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -78); Convergence for 78th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -77); Convergence for 77th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
## Warning: from glmnet Fortran code (error code -72); Convergence for 72th
## lambda value not reached after maxit=1000000 iterations; solutions for
## larger lambdas returned
plot(cvfit) # let's look at the cross validated error to choose lambda
You must enable Javascript to view this page properly.
We train on the data with a subset of features
train <- sample(nrow(subset_x), round(nrow(subset_x) * 0.7))
cvfit <- cv.glmnet(subset_x[train, ], y[train], family="multinomial",maxit=1000000)
plot(cvfit) # let's look at the cross validated error to choose lambda
You must enable Javascript to view this page properly.
coef(cvfit, s="lambda.1se")
## $`1`
## 16 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.4148554
## X26..before.pit..incoming.rank.before.pit.bef.leg .
## X34..before.pit..25th.percentile.rank.upto.bef.pit .
## X35..before.pit..75th.percentile.rank.upto.bef.pit .
## X82..before.pit..starting.position.of.the.car .
## X140..after.pit..Time.in.pits.before.outing .
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit .
## X41..before.pit..immediate.prev.rate.of.change.in.rank .
## X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood .
## X37..before.pit..prev.min.change.in.rank.over.all.epochs .
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES .
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127 .
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127 .
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 .
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 .
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 .
##
## $`2`
## 16 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.3960748
## X26..before.pit..incoming.rank.before.pit.bef.leg .
## X34..before.pit..25th.percentile.rank.upto.bef.pit .
## X35..before.pit..75th.percentile.rank.upto.bef.pit .
## X82..before.pit..starting.position.of.the.car .
## X140..after.pit..Time.in.pits.before.outing .
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit .
## X41..before.pit..immediate.prev.rate.of.change.in.rank .
## X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood .
## X37..before.pit..prev.min.change.in.rank.over.all.epochs .
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES .
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127 .
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127 .
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 .
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 .
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 .
##
## $`3`
## 16 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.01878058
## X26..before.pit..incoming.rank.before.pit.bef.leg .
## X34..before.pit..25th.percentile.rank.upto.bef.pit .
## X35..before.pit..75th.percentile.rank.upto.bef.pit .
## X82..before.pit..starting.position.of.the.car .
## X140..after.pit..Time.in.pits.before.outing .
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit .
## X41..before.pit..immediate.prev.rate.of.change.in.rank .
## X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood .
## X37..before.pit..prev.min.change.in.rank.over.all.epochs .
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES .
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127 .
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127 .
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 .
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 .
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 .
yhat <- predict(cvfit, newx=subset_x[-train, ], s="lambda.1se", type="class")
mean(y[-train] != yhat) # classification error
## [1] 0.4791667
table(y[-train], yhat) # confusion matrix
## yhat
## 1
## 1 25
## 2 6
## 3 17
We look at the pairwise plots of the features selected by the various algorithms.
# pairs(x, pch="o", col=y, lower.panel=NULL)
pairs(subset_x, pch="o", col=y, lower.panel=NULL)
Using only the selected subset of unique features
pca <- prcomp(subset_x, scale=TRUE)
plot(pca$x, col=y, pch=20)
The first two principal component directions represent the following linear combinations of nutrients:
pca$rotation[,1:2]
## PC1
## X26..before.pit..incoming.rank.before.pit.bef.leg 0.327763570
## X34..before.pit..25th.percentile.rank.upto.bef.pit 0.332897428
## X35..before.pit..75th.percentile.rank.upto.bef.pit 0.331663102
## X82..before.pit..starting.position.of.the.car 0.318676551
## X140..after.pit..Time.in.pits.before.outing 0.083945269
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit -0.086871107
## X41..before.pit..immediate.prev.rate.of.change.in.rank 0.005389654
## X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood 0.103631095
## X37..before.pit..prev.min.change.in.rank.over.all.epochs 0.020739322
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES 0.338731203
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127 0.327162559
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127 0.309152473
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 0.333154555
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 0.328357209
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 0.090793084
## PC2
## X26..before.pit..incoming.rank.before.pit.bef.leg -0.005552579
## X34..before.pit..25th.percentile.rank.upto.bef.pit -0.046894227
## X35..before.pit..75th.percentile.rank.upto.bef.pit -0.088282095
## X82..before.pit..starting.position.of.the.car -0.149965751
## X140..after.pit..Time.in.pits.before.outing 0.025022452
## X21..after.pit..race.variant..age.of.RS.tyre.at.pit -0.281360398
## X41..before.pit..immediate.prev.rate.of.change.in.rank 0.615307122
## X61..before.pit..NBHD.prev.epoch..number.of.4.pit.types.of.neighborhood -0.235295877
## X37..before.pit..prev.min.change.in.rank.over.all.epochs 0.651479184
## X137..before.pit...Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES 0.010908060
## X157..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.127 0.060309169
## X158..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.127 0.106668690
## X163..before.pit..sqrt.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 0.005166053
## X164..before.pit..square.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 0.020576760
## X165..before.pit..exp.of..Binned.avg.pre.slope..prev.epoch.final.rank.HI.RES.137 0.117133407