# Boruta is a wrapper algorithm built around Random Forest models that estimates how important each feature is for predicting a target variable.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(data.table)
library(Boruta)
## Loading required package: ranger
library(plyr)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(readr)
# ID.VAR and TARGET.VAR name the columns that are excluded from the set of
# candidate features further down in the script.
ID.VAR <- "rownames"
TARGET.VAR <- "quality"
#Get Data
# Directory holding the two wine-quality CSV files. It defaults to the
# original location; set the WINE_DATA_DIR environment variable to run the
# script against a copy stored somewhere else.
DATA.DIR <- Sys.getenv(
  "WINE_DATA_DIR",
  unset = "/Users/robertslattery/Desktop/Data Analytics Applications/winequality"
)
# Read one semicolon-separated CSV (with a header row) from DATA.DIR.
# Strings are kept as character rather than converted to factors.
read.wine <- function(file.name) {
  read.csv(
    file.path(DATA.DIR, file.name),
    header = TRUE,
    sep = ";",
    stringsAsFactors = FALSE
  )
}
red.df <- read.wine("winequality-red.csv")
white.df <- read.wine("winequality-white.csv")
# Stack the red and white rows into a single table.
sample.df <- rbind(red.df, white.df)
#Get Variable Names
# Every column that is neither the identifier nor the target is a candidate
# feature.
candidate.features <- setdiff(
  names(sample.df),
  c(ID.VAR, TARGET.VAR)
)
# Named vector mapping each candidate feature to the class of its column.
data.type <- sapply(sample.df[candidate.features], class)
# How many features there are of each class.
table(data.type)
## data.type
## numeric
## 11
print(data.type)
## fixed.acidity volatile.acidity citric.acid
## "numeric" "numeric" "numeric"
## residual.sugar chlorides free.sulfur.dioxide
## "numeric" "numeric" "numeric"
## total.sulfur.dioxide density pH
## "numeric" "numeric" "numeric"
## sulphates alcohol
## "numeric" "numeric"
#Get Data Types
# Build attr.data.types: a list with one element per column class, each
# holding the names of the features of that class.
explanatory.attributes <- setdiff(
  names(sample.df),
  c(ID.VAR, TARGET.VAR)
)
data.classes <- sapply(sample.df[explanatory.attributes], class)
unique.classes <- unique(data.classes)
# Levels follow first appearance so the list order matches unique.classes.
attr.data.types <- split(
  names(data.classes),
  factor(data.classes, levels = unique.classes)
)
#Set desired Response Variable
# TARGET.VAR is "quality", so this extracts the quality column.
response <- sample.df[[TARGET.VAR]]
#Remove Features
# Keep only the candidate feature columns (the result is still a data frame).
sample.df <- sample.df[, candidate.features, drop = FALSE]
#Missing Values
# Boruta stops with an error when its input contains NA, so missing values
# are replaced with a sentinel before the analysis runs.
#
# Columns of class "numeric" (double) are filled as well as "integer" ones:
# the class summary printed earlier shows that all 11 features are numeric,
# so handling only integer columns would leave every feature untouched.
# Exact [[ ]] lookup is used because `$` on a list partial-matches names
# (e.g. `$integer` could pick up an "integer64" element).
# NOTE(review): -1 is presumably outside the valid range of every
# measurement - confirm before reusing this on other data.
numeric.features <- c(
  attr.data.types[["integer"]],
  attr.data.types[["numeric"]]
)
for (x in numeric.features) {
  sample.df[[x]][is.na(sample.df[[x]])] <- -1
}
# Character features get an explicit "missing" level instead.
for (x in attr.data.types[["character"]]) {
  sample.df[[x]][is.na(sample.df[[x]])] <- "*MISSING*"
}
#Run Boruta Analysis
# Fix the RNG seed so repeated runs of the script use the same random draws.
set.seed(13)
# Features first, response second; at most 101 importance runs, no tracing.
bor.results <- Boruta(
  sample.df,
  response,
  maxRuns = 101,
  doTrace = 0
)
#Show summary of results and plots
# Structure of the returned object.
summary(bor.results)
## Length Class Mode
## finalDecision 11 factor numeric
## ImpHistory 154 -none- numeric
## pValue 1 -none- numeric
## maxRuns 1 -none- numeric
## light 1 -none- logical
## mcAdj 1 -none- logical
## timeTaken 1 difftime numeric
## roughfixed 1 -none- logical
## call 5 -none- call
## impSource 1 -none- character
# Headline decision counts.
print(bor.results)
## Boruta performed 11 iterations in 59.69156 secs.
## 11 attributes confirmed important: alcohol, chlorides,
## citric.acid, density, fixed.acidity and 6 more.
## No attributes deemed unimportant.
# Names of the attributes Boruta selected.
getSelectedAttributes(bor.results)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol"
# Per-attribute importance statistics, with the attribute name as a regular
# column, ordered from highest to lowest median importance.
bor.stats <- attStats(bor.results)
bor.stats <- cbind(attr = rownames(bor.stats), bor.stats)
arrange(bor.stats, desc(medianImp))
## attr meanImp medianImp minImp maxImp normHits
## 1 alcohol 66.93247 66.37746 64.02726 71.47146 1
## 2 volatile.acidity 58.76267 58.51803 56.12248 63.09908 1
## 3 free.sulfur.dioxide 50.06889 49.44554 48.57471 52.37226 1
## 4 sulphates 45.11266 45.20490 42.44815 49.14886 1
## 5 total.sulfur.dioxide 40.36112 40.63149 37.52594 41.96431 1
## 6 residual.sugar 40.69676 39.90029 37.06348 44.88182 1
## 7 citric.acid 39.26786 39.09480 37.42311 42.06390 1
## 8 pH 37.91077 38.56443 33.61972 40.39967 1
## 9 chlorides 37.08763 36.86928 35.67585 38.63937 1
## 10 density 36.00408 36.04931 34.01104 37.40516 1
## 11 fixed.acidity 33.53393 33.47591 29.63287 36.59272 1
## decision
## 1 Confirmed
## 2 Confirmed
## 3 Confirmed
## 4 Confirmed
## 5 Confirmed
## 6 Confirmed
## 7 Confirmed
## 8 Confirmed
## 9 Confirmed
## 10 Confirmed
## 11 Confirmed
# Importance plot restricted to x positions 4 through 14.
# NOTE(review): presumably this crops the three shadow attributes off the
# left edge - confirm against the rendered plot.
plot(
  bor.results,
  xlim = c(4, 14)
)