Adena Lin
November 18, 2015
Dataset from http://archive.ics.uci.edu/ml/datasets/Wine+Quality.
library(caret)
# Load in data
# The UCI wine-quality files are semicolon-delimited but use "." as the decimal
# mark. read.csv2() assumes dec = ",", which would leave every non-integer
# column as text instead of numeric, so read.csv() is used with an explicit
# separator (its default decimal mark is ".").
redwine <- read.csv("~/Documents/Current Courses/PSYCH 486/Wine/winequality-red.csv", sep = ";")
whitewine <- read.csv("~/Documents/Current Courses/PSYCH 486/Wine/winequality-white.csv", sep = ";")
I used 2 methods of selecting predictors:
(1) Manual selection based on correlations
(2) Automatic feature selection
# Center, scale, and transform red wine data
# The transformation is estimated from the first 11 columns only (the
# predictors) and then applied to the full red wine data frame.
preprocess_redwine <- preProcess(
  x = redwine[, 1:11],
  method = c("BoxCox", "center", "scale")
)
# Wrapping the result as `trans = ...` prefixes every column name with "trans."
new_redwine <- data.frame(
  trans = predict(preprocess_redwine, newdata = redwine)
)
Box Cox Transformation
$Y^{\lambda}$, where Y would be the values in the dataset and λ would be the exponent (ranges from -5 to 5).
Examples
If λ = 2, all the values would be squared.
If λ = 0.5, the square root of the values would be taken.
If λ = 0, the log of the values would be taken.
Chlorides: pre-transformation
Post-transformation
Using boxplots to check for outliers
# Draw one boxplot per transformed predictor, in the same order as the columns
# are listed here, to inspect each for outliers.
invisible(lapply(
  c(
    "trans.fixed.acidity",
    "trans.volatile.acidity",
    "trans.citric.acid",
    "trans.residual.sugar",
    "trans.chlorides",
    "trans.free.sulfur.dioxide",
    "trans.total.sulfur.dioxide", # no outliers
    "trans.density",
    "trans.pH",
    "trans.sulphates",
    "trans.alcohol" # no outliers
  ),
  function(predictor) boxplot(new_redwine[[predictor]])
))
After scaling, centering and transforming data, we remove outliers outside of 3 SDs
# Keep only rows whose transformed fixed acidity has absolute value <= 3
# (the column was centered and scaled above, so the units are SDs).
new_redwine <- new_redwine[abs(new_redwine$trans.fixed.acidity) <= 3, ]
This would be done for all predictors with outliers.
# Find correlations between predictors and quality
library(corrplot)
# Pairwise correlations across every column of the transformed red wine data
# (predictors and quality), shown as numbers in the lower triangle only.
redCorMatrix <- cor(new_redwine)
corrplot(redCorMatrix, method = "number", type = "lower")
(Top 5) Predictors with highest correlations with quality:
Same steps would be repeated to find predictors for white wine, which will be
# Collinearity check: regress quality on every other column and inspect the
# variance inflation factors of the fitted model.
# NOTE(review): vif() is not attached by any library() call visible in this
# file; presumably it comes from the car package -- confirm it is loaded.
redCollinear <- lm(formula = quality ~ ., data = redwine)
vif(redCollinear)
# No VIF > 10 for red wine, no variables highly correlated

# Same check for white wine, plus a numeric correlation plot of its columns.
whiteCollinear <- lm(formula = quality ~ ., data = whitewine)
vif(whiteCollinear)
corrplot(cor(whitewine), method = "number")
# For variables of interest, alcohol and density have very high VIFs
For White Wine, alcohol and density were highly correlated (~0.80).
As alcohol had higher average correlations to the rest of the variables, it was removed as a variable for white wine.
# Create new data frame with variables of interest for red wine and white wine
# Every column is named explicitly. An unnamed argument such as
# data.frame(new_redwine$trans.quality) is auto-named
# "new_redwine.trans.quality", so the later `redwData$quality` lookups and
# `quality ~ .` model formulas would not find the outcome column.
redwData <- data.frame(
  volatile.acidity = new_redwine$trans.volatile.acidity,
  citric.acid = new_redwine$trans.citric.acid,
  chlorides = new_redwine$trans.chlorides,
  total.sulfur.dioxide = new_redwine$trans.total.sulfur.dioxide,
  alcohol = new_redwine$trans.alcohol,
  quality = new_redwine$trans.quality
)
whitewData <- data.frame(
  volatile.acidity = new_whitewine$trans.volatile.acidity,
  chlorides = new_whitewine$trans.chlorides,
  total.sulfur.dioxide = new_whitewine$trans.total.sulfur.dioxide,
  density = new_whitewine$trans.density,
  quality = new_whitewine$trans.quality
)
# 75/25 train/test split
# RED WINE
# Seed fixed so the partition drawn below is reproducible.
set.seed(1)
redSplit <- createDataPartition(y = redwData$quality, p = 0.75, list = FALSE)
# Rows selected by the partition form the training set; the rest are held out.
redTest <- redwData[-redSplit, ]
redTrain <- redwData[redSplit, ]
# WHITE WINE
# No new seed is set here, so this partition continues from the RNG state left
# by the red wine split above.
whiteSplit <- createDataPartition(y = whitewData$quality, p = 0.75, list = FALSE)
whiteTest <- whitewData[-whiteSplit, ]
whiteTrain <- whitewData[whiteSplit, ]
Using k-fold cross-validation with 10 folds, repeated 10 times.
# Resampling scheme shared by the models trained below:
# repeated cross-validation with number = 10 and repeats = 10.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10
)
The next few slides will go over the following regression models
…for red wine, although white wine models will be built the same way using different predictors.
# Train red wine KNN model
# Candidate values k = 1..30 are compared under the fitControl resampling.
set.seed(2)
redKnn1 <- train(
  quality ~ .,
  data = redTrain,
  method = "knn",
  trControl = fitControl,
  tuneGrid = data.frame(.k = seq_len(30))
)
# Test KNN model on red wine test data
# Predict on the held-out rows, then score the predictions against the
# observed quality values.
redw_knnPredictions1 <- predict(redKnn1, newdata = redTest)
r2_redw_knn1 <- R2(pred = redw_knnPredictions1, obs = redTest$quality)
rmse_redw_knn1 <- RMSE(pred = redw_knnPredictions1, obs = redTest$quality)
R2 = 0.3099295, RMSE = 0.6752208
# Train red wine neural networks model
# The tuning grid crosses 5 values of size (1-5) with 4 values of decay;
# linout and maxit are passed through train() to the underlying fit.
set.seed(4)
redNN_1 <- train(
  quality ~ .,
  data = redTrain,
  method = "nnet",
  linout = TRUE,
  maxit = 100,
  tuneGrid = expand.grid(
    .size = 1:5,
    .decay = c(0, 0.001, 0.01, 0.1)
  ),
  trControl = fitControl
)
# Test model
# Predict on the held-out rows, then score against observed quality.
redw_NNPredictions1 <- predict(redNN_1, newdata = redTest)
r2_redw_nn1 <- R2(pred = redw_NNPredictions1, obs = redTest$quality)
rmse_redw_nn1 <- RMSE(pred = redw_NNPredictions1, obs = redTest$quality)
R2 = 0.3169731, RMSE = 0.6714897
# Train
# Red wine model fit with caret method "svmRadial"; tuneLength = 5 lets
# train() build the tuning grid itself instead of taking an explicit one.
set.seed(7)
redSVM_1 <- train(
  quality ~ .,
  data = redTrain,
  method = "svmRadial",
  trControl = fitControl,
  tuneLength = 5
)
# Test
# Predict on the held-out rows, then score against observed quality.
redw_SVMPredictions1 <- predict(redSVM_1, newdata = redTest)
r2_redw_svm1 <- R2(pred = redw_SVMPredictions1, obs = redTest$quality)
rmse_redw_svm1 <- RMSE(pred = redw_SVMPredictions1, obs = redTest$quality)
R2 = 0.3214969, RMSE = 0.6716413
KNN
NN
SVM
# Automatic feature selection with rfe(), using the lmFuncs helper set and
# repeated cross-validation (repeats = 5) as the resampling scheme.
ctrl <- rfeControl(
  functions = lmFuncs,
  method = "repeatedcv",
  repeats = 5,
  verbose = TRUE
)
# Red wine: columns 1-11 are the candidate predictors, column 12 the outcome.
# Subset sizes 1 through 11 are compared on RMSE.
redwineRFE <- rfe(
  x = new_redwine[, 1:11],
  y = new_redwine[, 12],
  sizes = 1:11,
  metric = "RMSE",
  rfeControl = ctrl
)
# White wine: same call with the white wine data frame.
whitewineRFE <- rfe(
  x = new_whitewine[, 1:11],
  y = new_whitewine[, 12],
  sizes = 1:11,
  metric = "RMSE",
  rfeControl = ctrl
)
Red Wine Results from Feature Selection
White Wine Results
5 new predictors for each wine
# Data frames holding the 5 predictors chosen for each wine plus quality.
# Every column is named explicitly. An unnamed argument such as
# data.frame(new_redwine$trans.quality) is auto-named
# "new_redwine.trans.quality", so the `redw_fs$quality` / `whitew_fs$quality`
# lookups used for the split would return NULL.
redw_fs <- data.frame(
  volatile.acidity = new_redwine$trans.volatile.acidity,
  total.sulfur.dioxide = new_redwine$trans.total.sulfur.dioxide,
  density = new_redwine$trans.density,
  sulphates = new_redwine$trans.sulphates,
  alcohol = new_redwine$trans.alcohol,
  quality = new_redwine$trans.quality
)
whitew_fs <- data.frame(
  volatile.acidity = new_whitewine$trans.volatile.acidity,
  residual.sugar = new_whitewine$trans.residual.sugar,
  free.sulfur.dioxide = new_whitewine$trans.free.sulfur.dioxide,
  density = new_whitewine$trans.density,
  alcohol = new_whitewine$trans.alcohol,
  quality = new_whitewine$trans.quality
)
# RED WINE
# 75/25 split of the feature-selected red wine data; this overwrites the
# earlier redSplit / redTrain / redTest objects.
redSplit <- createDataPartition(
  y = redw_fs$quality,
  p = 0.75,
  list = FALSE
)
redTest <- redw_fs[-redSplit, ]
redTrain <- redw_fs[redSplit, ]
# WHITE WINE
# Same split for the feature-selected white wine data.
whiteSplit <- createDataPartition(
  y = whitew_fs$quality,
  p = 0.75,
  list = FALSE
)
whiteTest <- whitew_fs[-whiteSplit, ]
whiteTrain <- whitew_fs[whiteSplit, ]
Each wine's data is then run through the KNN, NN, and SVM models (as before with manual feature selection), using its respective 5 variables.
KNN
NN
SVM
KNN
NN
SVM
(1) Automatic feature selection models performed better.
(2) Red wine models overall performed better than white wine models.
(3) Performance of the model types (KNN vs. NN vs. SVM) was relatively similar overall.