library(tidyverse)
library(caret)
library(doMC)
library(ggcorrplot)
# Register the doMC parallel backend with 7 worker cores.
doMC::registerDoMC(cores = 7)
Load Datasets
# Read the white-wine training data from the gzipped CSV.
winedataset_blanco <- readr::read_csv(file = "blanco_train.csv.gz")
Parsed with column specification:
cols(
`fixed acidity` = col_double(),
`volatile acidity` = col_double(),
`citric acid` = col_double(),
`residual sugar` = col_double(),
chlorides = col_double(),
`free sulfur dioxide` = col_double(),
`total sulfur dioxide` = col_double(),
density = col_double(),
pH = col_double(),
sulphates = col_double(),
alcohol = col_double(),
quality = col_integer()
)
# Read the red-wine training data from the gzipped CSV.
winedataset_red <- readr::read_csv(file = "tinto_train.csv.gz")
Parsed with column specification:
cols(
`fixed acidity` = col_double(),
`volatile acidity` = col_double(),
`citric acid` = col_double(),
`residual sugar` = col_double(),
chlorides = col_double(),
`free sulfur dioxide` = col_double(),
`total sulfur dioxide` = col_double(),
density = col_double(),
pH = col_double(),
sulphates = col_double(),
alcohol = col_double(),
quality = col_integer()
)
# Tag each source table with its wine colour, then stack both into one table.
winedataset_blanco$type <- "white"
winedataset_red$type <- "red"
winedataset <- rbind(winedataset_blanco, winedataset_red)
winedataset
# Per-column missing-value check. is.null() applied to a column is always
# FALSE (a column vector is never NULL), so it could not flag anything;
# anyNA() reports whether the column actually contains NA values.
winedataset %>% map(anyNA)
$`fixed acidity`
[1] FALSE
$`volatile acidity`
[1] FALSE
$`citric acid`
[1] FALSE
$`residual sugar`
[1] FALSE
$chlorides
[1] FALSE
$`free sulfur dioxide`
[1] FALSE
$`total sulfur dioxide`
[1] FALSE
$density
[1] FALSE
$pH
[1] FALSE
$sulphates
[1] FALSE
$alcohol
[1] FALSE
$quality
[1] FALSE
$type
[1] FALSE
# Number of wines per quality score.
winedataset %>%
  count(quality, name = "total")
# Number of wines per (total sulfur dioxide, quality) combination.
winedataset %>%
  group_by(`total sulfur dioxide`, quality) %>%
  summarise(total = n())
Exploratory Analysis
Correlation Matrix
# Correlation matrix of every column except the character `type` label.
cor_matrix <- winedataset %>%
  select(-type) %>%
  cor()
ggcorrplot(cor_matrix)

Boxplot of volatile acidity by quality
# Distribution of volatile acidity for each quality score, one box per score,
# filled by the same score.
winedataset %>%
  ggplot(
    mapping = aes(
      x = as.factor(quality),
      y = `volatile acidity`,
      fill = as.factor(quality)
    )
  ) +
  geom_boxplot()

NA
Boxplot of alcohol by quality
# Distribution of alcohol content for each quality score.
winedataset %>%
  ggplot(mapping = aes(x = as.factor(quality), y = alcohol)) +
  geom_boxplot()
Create categorical features
Create category labels for quality
Create clustering labels (dbscan)
Create clustering labels (kmeans)
Eliminate type
# Drop the `type` column from the modelling table, then list what remains.
trainset <- select(trainset, -type)
names(trainset)
[1] "fixed acidity" "volatile acidity" "citric acid" "residual sugar" "chlorides"
[6] "free sulfur dioxide" "total sulfur dioxide" "density" "pH" "sulphates"
[11] "alcohol" "quality" "cluster"
Split train and test
# Hold out 20% of the rows for testing; createDataPartition() samples within
# the levels of the quality factor so both parts keep a similar class mix.
# NOTE(review): no seed is set in the visible code - confirm set.seed() is
# called upstream if the split needs to be reproducible.
trainIndex <- createDataPartition(
  as.factor(trainset$quality),
  p = 0.80,
  list = FALSE
)
# With list = FALSE the result is a one-column matrix. Recent tibble versions
# deprecate matrix row subscripts, so index with the column as a plain vector
# (this works the same for a base data.frame).
data_train <- trainset[trainIndex[, 1], ]
data_test <- trainset[-trainIndex[, 1], ]
# Make column names syntactically valid (e.g. "fixed acidity" becomes
# "fixed.acidity").
colnames(data_train) <- make.names(colnames(data_train))
colnames(data_test) <- make.names(colnames(data_test))
Plot class distribution in train
# Bar chart of the number of training rows per quality class.
ggplot(
  data = data_train %>% group_by(quality) %>% summarise(total = n()),
  mapping = aes(x = quality, y = total, fill = quality)
) +
  geom_col() +
  theme_classic()

Plot class distribution in test
# Bar chart of the number of test rows per quality class.
ggplot(
  data = data_test %>% group_by(quality) %>% summarise(total = n()),
  mapping = aes(x = quality, y = total, fill = quality)
) +
  geom_col() +
  theme_classic()

Feature selection
Train model
# Resampling settings for model training: 5-fold cross-validation.
# `repeats` is intentionally not set: it only applies to repeated CV, and
# passing it with method = "cv" makes caret warn that it "has no meaning for
# this resampling method".
ctrl_fast <- trainControl(
  method = "cv",
  number = 5,
  # summaryFunction = twoClassSummary,
  verboseIter = TRUE,
  classProbs = TRUE,
  allowParallel = TRUE
)
`repeats` has no meaning for this resampling method.
# Unscaled variable importance of the fitted model, shown as a plot.
importance <- rfFitupsam %>%
  varImp(scale = FALSE)
plot(importance)

Test model
# Predict on the held-out rows and compare against the true quality labels.
predsrfprobsamp <- predict(rfFitupsam, data_test)
# use for regression
# confusionMatrix(as.factor(predsrfprobsamp %>% round()), as.factor(data_test$quality))
confusionMatrix(
  data = predsrfprobsamp,
  reference = as.factor(data_test$quality)
)
Confusion Matrix and Statistics
Reference
Prediction high low medium
high 122 0 45
low 0 3 4
medium 82 37 745
Overall Statistics
Accuracy : 0.8382
95% CI : (0.8143, 0.8601)
No Information Rate : 0.7649
P-Value [Acc > NIR] : 4.354e-09
Kappa : 0.5116
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: high Class: low Class: medium
Sensitivity 0.5980 0.075000 0.9383
Specificity 0.9460 0.995992 0.5123
Pos Pred Value 0.7305 0.428571 0.8623
Neg Pred Value 0.9059 0.964113 0.7184
Prevalence 0.1965 0.038536 0.7649
Detection Rate 0.1175 0.002890 0.7177
Detection Prevalence 0.1609 0.006744 0.8324
Balanced Accuracy 0.7720 0.535496 0.7253

