library(tidyverse)
library(caret)
library(doMC)
library(ggcorrplot)
# Register the doMC parallel backend with 7 worker cores.
registerDoMC(cores = 7)

Load Datasets

# Read the white-wine ("blanco") training set from its gzipped CSV.
winedataset_blanco <- read_csv(file = "blanco_train.csv.gz")
Parsed with column specification:
cols(
  `fixed acidity` = col_double(),
  `volatile acidity` = col_double(),
  `citric acid` = col_double(),
  `residual sugar` = col_double(),
  chlorides = col_double(),
  `free sulfur dioxide` = col_double(),
  `total sulfur dioxide` = col_double(),
  density = col_double(),
  pH = col_double(),
  sulphates = col_double(),
  alcohol = col_double(),
  quality = col_integer()
)
# Read the red-wine ("tinto") training set from its gzipped CSV.
winedataset_red <- read_csv(file = "tinto_train.csv.gz")
Parsed with column specification:
cols(
  `fixed acidity` = col_double(),
  `volatile acidity` = col_double(),
  `citric acid` = col_double(),
  `residual sugar` = col_double(),
  chlorides = col_double(),
  `free sulfur dioxide` = col_double(),
  `total sulfur dioxide` = col_double(),
  density = col_double(),
  pH = col_double(),
  sulphates = col_double(),
  alcohol = col_double(),
  quality = col_integer()
)
# Tag every row with its wine type, then stack both sets into one table.
winedataset_blanco$type <- "white"
winedataset_red$type <- "red"
winedataset <- rbind(winedataset_blanco, winedataset_red)
winedataset
# Report, per column, whether it contains any missing value.
# `is.null()` applied to a column is always FALSE (a column that exists is
# never NULL), so it cannot detect missing data; `is.na()` is the right test.
winedataset %>% map(~ any(is.na(.x)))
$`fixed acidity`
[1] FALSE

$`volatile acidity`
[1] FALSE

$`citric acid`
[1] FALSE

$`residual sugar`
[1] FALSE

$chlorides
[1] FALSE

$`free sulfur dioxide`
[1] FALSE

$`total sulfur dioxide`
[1] FALSE

$density
[1] FALSE

$pH
[1] FALSE

$sulphates
[1] FALSE

$alcohol
[1] FALSE

$quality
[1] FALSE

$type
[1] FALSE
# Number of wines per quality score.
winedataset %>%
  count(quality, name = "total")
# Number of wines per (total sulfur dioxide, quality) combination.
winedataset %>%
  group_by(`total sulfur dioxide`, quality) %>%
  summarise(total = n())

Exploratory Analysis

Correlation Matrix

# Correlation matrix of all numeric columns (the character `type` column is
# dropped because cor() only accepts numeric input).
cor_matrix <- winedataset %>%
  select(-type) %>%
  cor()
ggcorrplot(cor_matrix)

Boxplot of volatile acidity by quality

# Volatile acidity distribution per quality score, one coloured box per score.
winedataset %>%
  ggplot(
    aes(
      x = as.factor(quality),
      y = `volatile acidity`,
      fill = as.factor(quality)
    )
  ) +
  geom_boxplot()

NA

Boxplot of alcohol by quality

# Alcohol distribution per quality score.
winedataset %>%
  ggplot(aes(x = as.factor(quality), y = `alcohol`)) +
  geom_boxplot()

Create categorical features

Create category labels for quality

Create clustering labels (dbscan)

Create clustering labels (kmeans)

Eliminate type

# Drop the wine-type column from the modelling table and list what remains.
trainset <- select(trainset, -type)
colnames(trainset)
 [1] "fixed acidity"        "volatile acidity"     "citric acid"          "residual sugar"       "chlorides"           
 [6] "free sulfur dioxide"  "total sulfur dioxide" "density"              "pH"                   "sulphates"           
[11] "alcohol"              "quality"              "cluster"             

Split train and test

# createDataPartition() samples at random; fix the RNG seed so the split (and
# every metric computed from it) is reproducible between runs.
set.seed(42)
# 80/20 split; passing quality as a factor makes the sampling happen within
# each quality level, keeping class proportions similar in both parts.
trainIndex <- createDataPartition(
  as.factor(trainset$quality),
  p = 0.80,
  list = FALSE
)
data_train <- trainset[trainIndex, ]
data_test <- trainset[-trainIndex, ]
# Turn column names such as "fixed acidity" into syntactically valid R names
# ("fixed.acidity") so they can be used in model formulas.
colnames(data_train) <- make.names(colnames(data_train))
colnames(data_test) <- make.names(colnames(data_test))

Plot class distribution in train

# Bar chart of how many training rows fall into each quality class.
data_train %>%
  group_by(quality) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = quality, y = total, fill = quality)) +
  geom_col() +
  theme_classic()

Plot class distribution in test

# Bar chart of how many test rows fall into each quality class.
data_test %>%
  group_by(quality) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = quality, y = total, fill = quality)) +
  geom_col() +
  theme_classic()

Feature selection

Train model

# Resampling configuration for model training: plain 5-fold cross-validation.
# `repeats` only applies to method = "repeatedcv"; supplying it together with
# method = "cv" makes trainControl() warn that it has no meaning, so it is
# omitted here.
ctrl_fast <- trainControl(
  method = "cv",
  number = 5,
  # summaryFunction = twoClassSummary,  # two-class problems only
  verboseIter = TRUE,   # print progress for each resampling iteration
  classProbs = TRUE,    # compute class probabilities in each resample
  allowParallel = TRUE  # use the registered parallel backend if available
)
`repeats` has no meaning for this resampling method.
# Unscaled variable importance of the fitted model, shown as a plot.
importance <- rfFitupsam %>%
  varImp(scale = FALSE)
plot(importance)

Test model

# Predict the quality class of every held-out row.
predsrfprobsamp <- predict(rfFitupsam, newdata = data_test)
# For a regression model, round the numeric predictions before comparing:
# confusionMatrix(as.factor(predsrfprobsamp %>% round()), as.factor(data_test$quality))
# Compare the predictions against the true labels.
confusionMatrix(
  data = predsrfprobsamp,
  reference = as.factor(data_test$quality)
)
Confusion Matrix and Statistics

          Reference
Prediction high low medium
    high    122   0     45
    low       0   3      4
    medium   82  37    745

Overall Statistics
                                          
               Accuracy : 0.8382          
                 95% CI : (0.8143, 0.8601)
    No Information Rate : 0.7649          
    P-Value [Acc > NIR] : 4.354e-09       
                                          
                  Kappa : 0.5116          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: high Class: low Class: medium
Sensitivity               0.5980   0.075000        0.9383
Specificity               0.9460   0.995992        0.5123
Pos Pred Value            0.7305   0.428571        0.8623
Neg Pred Value            0.9059   0.964113        0.7184
Prevalence                0.1965   0.038536        0.7649
Detection Rate            0.1175   0.002890        0.7177
Detection Prevalence      0.1609   0.006744        0.8324
Balanced Accuracy         0.7720   0.535496        0.7253

