Boruta Feature Analysis

Boruta is a wrapper algorithm that uses Random Forest models to determine the feature importance of variables towards some target variable.

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(data.table)
library(Boruta)
## Loading required package: ranger
library(plyr)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:data.table':
## 
##     between, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(readr)

# Column names with special roles. Both are excluded from the candidate
# feature list; TARGET.VAR names the response column.
ID.VAR <- "rownames"
TARGET.VAR <- "quality"

#Get Data
# Directory holding the two wine-quality CSV files. The default is the original
# location; set the WINE_DATA_DIR environment variable to run the script on
# another machine without editing it.
DATA.DIR <- Sys.getenv(
  "WINE_DATA_DIR",
  unset = "/Users/robertslattery/Desktop/Data Analytics Applications/winequality"
)

# Read one semicolon-delimited wine-quality file located in DATA.DIR.
read.wine <- function(file.name) {
  read.csv(file.path(DATA.DIR, file.name), header = TRUE, sep = ";",
           stringsAsFactors = FALSE)
}

red.df <- read.wine("winequality-red.csv")
white.df <- read.wine("winequality-white.csv")

# Stack the red and white rows into a single table for the analysis.
sample.df <- rbind(red.df, white.df)

#Get Variable Names
# Every column except the ID and target columns is a candidate feature.
candidate.features <- setdiff(names(sample.df), c(ID.VAR, TARGET.VAR))
# Primary class of each candidate column. vapply() always yields a named
# character vector, whereas sapply() would return a list for columns that
# carry more than one class.
data.type <- vapply(candidate.features,
                    function(x) class(sample.df[[x]])[1],
                    character(1))
table(data.type)
## data.type
## numeric 
##      11
print(data.type)
##        fixed.acidity     volatile.acidity          citric.acid 
##            "numeric"            "numeric"            "numeric" 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##            "numeric"            "numeric"            "numeric" 
## total.sulfur.dioxide              density                   pH 
##            "numeric"            "numeric"            "numeric" 
##            sulphates              alcohol 
##            "numeric"            "numeric"
#Get Data Types
# These are the same feature list and classes computed above; reuse them
# under the names the rest of the script expects instead of recomputing.
explanatory.attributes <- candidate.features
data.classes <- data.type
unique.classes <- unique(data.classes)
# Named list mapping each class name to the columns that have that class.
attr.data.types <- lapply(unique.classes, function(cls) {
  names(data.classes)[data.classes == cls]
})
names(attr.data.types) <- unique.classes

#Set desired Response Variable
# Look the response up through TARGET.VAR so the target is defined in one
# place only: the candidate feature list is already derived from TARGET.VAR.
# Fail early with a clear message if the column is absent, rather than passing
# NULL on to the model.
stopifnot(TARGET.VAR %in% names(sample.df))
response <- sample.df[[TARGET.VAR]]
# NOTE(review): the response is passed on as read from the CSV; if the quality
# score should be modelled as classes rather than a number, convert it to a
# factor here -- confirm the intended task.

#Remove Features
# Keep only the candidate features. This drops the target column, which is
# why the response is extracted before this line.
sample.df <- sample.df[candidate.features]

#Missing Values
# Sentinel used to replace NA, keyed by column class. "numeric" (double)
# columns are included: every feature in this data set has that class, so
# without this entry no column would ever be filled and NAs would be passed
# straight to Boruta. The integer sentinel is written as -1L so that integer
# columns are not silently coerced to double by the assignment.
missing.fill <- list(integer = -1L, numeric = -1, character = "*MISSING*")

for (type.name in names(missing.fill)) {
  # attr.data.types has no entry for classes that do not occur in the data;
  # [[ ]] then returns NULL and the inner loop is skipped.
  for (x in attr.data.types[[type.name]]) {
    sample.df[[x]][is.na(sample.df[[x]])] <- missing.fill[[type.name]]
  }
}



#Run Boruta Analysis
# Seed the RNG before fitting so the run can be repeated.
set.seed(13)
# Features in sample.df, target in response; at most 101 runs, tracing off.
bor.results <- Boruta(
  sample.df,
  response,
  maxRuns = 101,
  doTrace = 0
)

#Show summary of results and plots
# Structural overview of the components stored in the result object.
summary(bor.results)
##               Length Class    Mode     
## finalDecision  11    factor   numeric  
## ImpHistory    154    -none-   numeric  
## pValue          1    -none-   numeric  
## maxRuns         1    -none-   numeric  
## light           1    -none-   logical  
## mcAdj           1    -none-   logical  
## timeTaken       1    difftime numeric  
## roughfixed      1    -none-   logical  
## call            5    -none-   call     
## impSource       1    -none-   character
# Headline outcome: number of iterations and the confirmed/rejected split.
print(bor.results)
## Boruta performed 11 iterations in 59.69156 secs.
##  11 attributes confirmed important: alcohol, chlorides,
## citric.acid, density, fixed.acidity and 6 more.
##  No attributes deemed unimportant.
# Names of the attributes Boruta selected.
getSelectedAttributes(bor.results)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"
# Per-attribute importance statistics, with the attribute name promoted from
# the row names to a column, sorted from highest to lowest median importance.
importance.stats <- attStats(bor.results)
importance.table <- cbind(attr = rownames(importance.stats), importance.stats)
arrange(importance.table, desc(medianImp))
##                    attr  meanImp medianImp   minImp   maxImp normHits
## 1               alcohol 66.93247  66.37746 64.02726 71.47146        1
## 2      volatile.acidity 58.76267  58.51803 56.12248 63.09908        1
## 3   free.sulfur.dioxide 50.06889  49.44554 48.57471 52.37226        1
## 4             sulphates 45.11266  45.20490 42.44815 49.14886        1
## 5  total.sulfur.dioxide 40.36112  40.63149 37.52594 41.96431        1
## 6        residual.sugar 40.69676  39.90029 37.06348 44.88182        1
## 7           citric.acid 39.26786  39.09480 37.42311 42.06390        1
## 8                    pH 37.91077  38.56443 33.61972 40.39967        1
## 9             chlorides 37.08763  36.86928 35.67585 38.63937        1
## 10              density 36.00408  36.04931 34.01104 37.40516        1
## 11        fixed.acidity 33.53393  33.47591 29.63287 36.59272        1
##     decision
## 1  Confirmed
## 2  Confirmed
## 3  Confirmed
## 4  Confirmed
## 5  Confirmed
## 6  Confirmed
## 7  Confirmed
## 8  Confirmed
## 9  Confirmed
## 10 Confirmed
## 11 Confirmed
# Importance plot restricted to x positions 4 through 14.
# NOTE(review): presumably this crops the plot to the 11 real attributes --
# confirm which boxes occupy positions 1-3.
plot(bor.results, xlim = c(4, 14))

Boruta is a Random Forest based wrapper package that determines variable importance. Running the package on the UC Irvine Wine Quality dataset confirms all 11 features as important and gives the following variable importance ranking (by median importance, highest first): Alcohol, Volatile Acidity, Free Sulfur Dioxide, Sulphates, Total Sulfur Dioxide, Residual Sugar, Citric Acid, pH, Chlorides, Density and Fixed Acidity.