library(tidyverse)
library(caret)
library(doMC)
library(ggcorrplot)
registerDoMC(cores=7)

Load Datasets

# Read the "blanco" training file (tagged as white wine further down); the
# column specification guessed by readr is printed right after this call.
winedataset_blanco <- read_csv("blanco_train.csv.gz")
Parsed with column specification:
cols(
  `fixed acidity` = col_double(),
  `volatile acidity` = col_double(),
  `citric acid` = col_double(),
  `residual sugar` = col_double(),
  chlorides = col_double(),
  `free sulfur dioxide` = col_double(),
  `total sulfur dioxide` = col_double(),
  density = col_double(),
  pH = col_double(),
  sulphates = col_double(),
  alcohol = col_double(),
  quality = col_integer()
)
# Read the "tinto" training file (tagged as red wine further down); the
# column specification guessed by readr is printed right after this call.
winedataset_red <- read_csv("tinto_train.csv.gz")
Parsed with column specification:
cols(
  `fixed acidity` = col_double(),
  `volatile acidity` = col_double(),
  `citric acid` = col_double(),
  `residual sugar` = col_double(),
  chlorides = col_double(),
  `free sulfur dioxide` = col_double(),
  `total sulfur dioxide` = col_double(),
  density = col_double(),
  pH = col_double(),
  sulphates = col_double(),
  alcohol = col_double(),
  quality = col_integer()
)
# Tag each table with its wine colour, stack the two tables row-wise and
# print the combined data set.
winedataset_blanco$type="white"
winedataset_red$type="red"
winedataset<-rbind(winedataset_blanco,winedataset_red)
winedataset
# Apply is.null() to every column (one list element per column).
# NOTE(review): is.null() tests for a NULL object, not for NA values, so the
# all-FALSE listing that follows does not show whether data is missing.
winedataset %>% map(is.null)
$`fixed acidity`
[1] FALSE

$`volatile acidity`
[1] FALSE

$`citric acid`
[1] FALSE

$`residual sugar`
[1] FALSE

$chlorides
[1] FALSE

$`free sulfur dioxide`
[1] FALSE

$`total sulfur dioxide`
[1] FALSE

$density
[1] FALSE

$pH
[1] FALSE

$sulphates
[1] FALSE

$alcohol
[1] FALSE

$quality
[1] FALSE

$type
[1] FALSE
# Row count per quality score.
winedataset %>% group_by(quality) %>% summarise(total=n())
# Row count per (total sulfur dioxide, quality) combination.
winedataset %>% group_by(`total sulfur dioxide`,quality)  %>% summarise(total=n())

Exploratory Analysis

Correlation Matrix

# Correlation matrix of every column except `type`, drawn with ggcorrplot.
cor_matrix<-cor(winedataset %>% select(-type))
ggcorrplot(cor_matrix)

Boxplot volatile

# Boxplot of volatile acidity for each quality score; quality is converted to
# a factor so each score gets its own box and fill colour.
ggplot(winedataset)+
  geom_boxplot(aes(x=as.factor(quality),y=`volatile acidity`,fill=as.factor(quality)))

NA

Boxplot alcohol

# Boxplot of alcohol for each quality score (quality treated as a factor).
ggplot(winedataset)+
  geom_boxplot(aes(x=as.factor(quality),y=`alcohol`))

Create categorical features

Create category labels for quality

Create clustering labels (dbscan)

Create clustering labels (kmeans)

Eliminate type

# Drop the `type` column from the modelling table, then list the columns
# that remain.
trainset <- trainset %>% select(-type)
names(trainset)
 [1] "fixed acidity"        "volatile acidity"     "citric acid"          "residual sugar"       "chlorides"           
 [6] "free sulfur dioxide"  "total sulfur dioxide" "density"              "pH"                   "sulphates"           
[11] "alcohol"              "quality"              "cluster"             

Split train and test

# Split on the quality label with p = 0.80: rows selected by
# createDataPartition() go to data_train, all other rows go to data_test.
trainIndex <- createDataPartition(as.factor(trainset$quality), p=0.80, list=FALSE)
data_train <- trainset[ trainIndex,]
data_test <-  trainset[-trainIndex,]
# make.names() turns column names such as "fixed acidity" into syntactically
# valid R names so they can be used in a model formula.
colnames(data_train) <- make.names(colnames(data_train))
colnames(data_test) <- make.names(colnames(data_test))

Plot class distribution in train

# Bar chart of the number of rows per quality class in the training split.
data_train  %>% group_by(quality) %>% summarise(total=n()) %>%
  ggplot()+
  geom_col(aes(x=quality,y=total,fill=quality))+
  theme_classic()

Plot class distribution in test

# Bar chart of the number of rows per quality class in the test split.
data_test  %>% group_by(quality) %>% summarise(total=n()) %>%
  ggplot()+
  geom_col(aes(x=quality,y=total,fill=quality))+
  theme_classic()

Feature selection

Train model

# Resampling settings for caret::train(): 5-fold cross-validation with class
# probabilities, per-iteration logging and parallel execution enabled.
# NOTE(review): `repeats` is ignored for method="cv" -- caret says so in the
# message printed right after this call.
ctrl_fast <- trainControl(method="cv", 
                     repeats=1,
                     number=5, 
                   #  summaryFunction=twoClassSummary,
                     verboseIter=T,
                     classProbs=T,
                     allowParallel = TRUE)  
`repeats` has no meaning for this resampling method.
# Unscaled variable importance of the fitted model, plotted per predictor.
importance <- varImp(rfFitupsam, scale=FALSE)
plot(importance)

Test model

# Predict on the held-out split and compare the predictions with the true
# quality labels.
predsrfprobsamp=predict(rfFitupsam,data_test)
# Variant for a regression model (round numeric predictions first):
#confusionMatrix(as.factor(predsrfprobsamp %>% round()),as.factor(data_test$quality))
confusionMatrix(predsrfprobsamp,as.factor(data_test$quality))
Confusion Matrix and Statistics

          Reference
Prediction high low medium
    high    122   0     45
    low       0   3      4
    medium   82  37    745

Overall Statistics
                                          
               Accuracy : 0.8382          
                 95% CI : (0.8143, 0.8601)
    No Information Rate : 0.7649          
    P-Value [Acc > NIR] : 4.354e-09       
                                          
                  Kappa : 0.5116          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: high Class: low Class: medium
Sensitivity               0.5980   0.075000        0.9383
Specificity               0.9460   0.995992        0.5123
Pos Pred Value            0.7305   0.428571        0.8623
Neg Pred Value            0.9059   0.964113        0.7184
Prevalence                0.1965   0.038536        0.7649
Detection Rate            0.1175   0.002890        0.7177
Detection Prevalence      0.1609   0.006744        0.8324
Balanced Accuracy         0.7720   0.535496        0.7253

---
title: "Wine Quality meetup 20/03/2019"
output: html_notebook
---

```{r}
library(tidyverse)
library(caret)
library(doMC)
library(ggcorrplot)

# Register the parallel backend used by caret. The worker count keeps the
# original ceiling of 7 but never exceeds the cores actually available (one
# core is left free); it falls back to 1 when the core count is unknown (NA).
n_cores <- max(1L, min(7L, parallel::detectCores() - 1L), na.rm = TRUE)
registerDoMC(cores = n_cores)

```
## Load Datasets
```{r}
# Read the two training files ("blanco" = white, "tinto" = red).
winedataset_blanco <- read_csv("blanco_train.csv.gz")
winedataset_red <- read_csv("tinto_train.csv.gz")

# Tag every row with its wine colour before stacking the two tables.
winedataset_blanco$type <- "white"
winedataset_red$type <- "red"

winedataset <- rbind(winedataset_blanco, winedataset_red)

winedataset

# Missing-value check: one logical per column, TRUE when the column holds at
# least one NA. (is.null() on a column is always FALSE, so it cannot reveal
# missing data.)
winedataset %>% map_lgl(anyNA)
```
```{r}
# Number of wines at each quality score.
winedataset %>%
  group_by(quality) %>%
  summarise(total = n())

# Number of wines for each (total sulfur dioxide, quality) combination.
winedataset %>%
  group_by(`total sulfur dioxide`, quality) %>%
  summarise(total = n())
```
## Exploratory Analysis
### Correlation Matrix
```{r}
# Correlation matrix of every column except the character `type` column.
cor_matrix <- winedataset %>%
  select(-type) %>%
  cor()

ggcorrplot(cor_matrix)
```
### Boxplot: volatile acidity by quality
```{r}
# One box per quality score; the aesthetics are declared once on the plot and
# inherited by the boxplot layer.
ggplot(
  winedataset,
  aes(
    x = as.factor(quality),
    y = `volatile acidity`,
    fill = as.factor(quality)
  )
) +
  geom_boxplot()
  
```
### Boxplot: alcohol by quality
```{r}
# Alcohol distribution for each quality score (quality treated as a factor).
ggplot(winedataset, aes(x = as.factor(quality), y = alcohol)) +
  geom_boxplot()
```
## Create categorical features
```{r eval=FALSE, include=FALSE}
# Bucket volatile acidity and alcohol into low/medium/high bands, then drop
# the raw numeric columns listed in select().
#   vinegar:      <= 0.4 low, (0.4, 0.8] medium, > 0.8 high
#   acohol_level: <= 9 low,   (9, 11] medium,    > 11 high
# The inner ifelse() only sees values above the first threshold, so a single
# upper-bound comparison is enough there.
trainset <- winedataset %>%
  mutate(
    vinegar = ifelse(
      `volatile acidity` <= 0.4, 'low',
      ifelse(`volatile acidity` <= 0.8, 'medium', 'high')
    ),
    # Column name spelling ("acohol_level") is kept unchanged.
    acohol_level = ifelse(
      alcohol <= 9, 'low',
      ifelse(alcohol <= 11, 'medium', 'high')
    )
  ) %>%
  select(
    -`residual sugar`,
    -`fixed acidity`,
    -`volatile acidity`,
    -alcohol,
    -`free sulfur dioxide`
  )

# Alternative kept for reference: train on a hand-picked subset of raw columns.
#trainset<-winedataset %>% select(`volatile acidity`,density,pH,`residual sugar`,`fixed acidity`,`free sulfur dioxide`,quality)

# Scatter of free vs. total sulfur dioxide (disabled).
#ggplot(trainset)+
#         geom_point(aes(x=`free sulfur dioxide`,y=`total sulfur dioxide`))
```

## Create category labels for quality
```{r eval=FALSE, include=FALSE}
set.seed(10)

# Collapse the numeric quality score into three classes:
#   3-4 -> "low", 5-6 -> "medium", any other score -> "high".
# A missing score stays missing.
trainset <- winedataset %>%
  mutate(
    quality = case_when(
      is.na(quality) ~ NA_character_,
      quality %in% c(3, 4) ~ "low",
      quality %in% c(5, 6) ~ "medium",
      TRUE ~ "high"
    )
  )
# Disabled follow-up step: %>% filter(quality %in% c('seven','five','six'))

```
## Create clustering labels (dbscan)
```{r eval=FALSE, include=FALSE}
library(dbscan)

# Drop cluster labels left by an earlier clustering run (this chunk or the
# k-means one). Otherwise the old labels would be used as an input feature
# and cbind() would create a second column named `cluster`.
trainset$cluster <- NULL

# Density-based clustering on every column except the target and wine colour.
dbscan_mod <- dbscan(
  trainset %>% select(-quality, -type),
  eps = 5,
  minPts = 10
)
trainset <- cbind(trainset, cluster = dbscan_mod$cluster)

# Stacked bars: rows per quality class, coloured by cluster label.
trainset %>%
  group_by(quality, cluster) %>%
  summarise(n = n()) %>%
  ggplot() +
  geom_col(aes(x = quality, y = n, fill = as.factor(cluster)))

```

## Create clustering labels (kmeans)
```{r eval=FALSE, include=FALSE}


# Drop cluster labels left by an earlier clustering run (this chunk or the
# dbscan one). Otherwise the old labels would be used as an input feature
# and cbind() would create a second column named `cluster`.
trainset$cluster <- NULL

# k-means with 8 centres and 40 random starts on every column except the
# target and wine colour.
kmeans_mod <- kmeans(
  trainset %>% select(-quality, -type),
  centers = 8,
  nstart = 40
)
trainset <- cbind(trainset, cluster = kmeans_mod$cluster)

# Rows per cluster, one panel per quality class.
trainset %>%
  group_by(quality, cluster) %>%
  summarise(n = n()) %>%
  ggplot() +
  geom_col(aes(y = n, x = as.factor(cluster))) +
  facet_wrap(~quality)

trainset
```


## Remove the `type` column
```{r}

# Remove the wine-colour column from the modelling table.
# `trainset` is created in the "Create category labels for quality" chunk,
# which is marked eval=FALSE, so that chunk has to be run by hand first.
trainset <- select(trainset, -type)
```

```{r}
# List the columns that remain in the modelling table.
colnames(trainset)
```
## Split train and test
```{r}

# Split on the quality label with p = 0.80: selected rows form the training
# set, all other rows the test set.
trainIndex <- createDataPartition(
  as.factor(trainset$quality),
  p = 0.80,
  list = FALSE
)

# With list = FALSE the index comes back as a one-column matrix. Recent
# tibble versions reject a matrix as a row subscript, so index with the plain
# integer vector; this behaves the same for data frames and tibbles.
data_train <- trainset[trainIndex[, 1], ]
data_test <- trainset[-trainIndex[, 1], ]

# Turn names such as "fixed acidity" into syntactically valid R names so they
# can be used in a model formula.
colnames(data_train) <- make.names(colnames(data_train))
colnames(data_test) <- make.names(colnames(data_test))


```
### Plot class distribution in train
```{r}
# Rows per quality class in the training split, one coloured bar per class.
data_train %>%
  group_by(quality) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = quality, y = total, fill = quality)) +
  geom_col() +
  theme_classic()

```
### Plot class distribution in test
```{r}
# Rows per quality class in the test split, one coloured bar per class.
data_test %>%
  group_by(quality) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = quality, y = total, fill = quality)) +
  geom_col() +
  theme_classic()
```
## Feature selection
```{r eval=FALSE, include=FALSE}
# Recursive feature elimination with caret's random-forest helper functions,
# evaluated with 10-fold cross-validation over subset sizes 1 to 13.
rfecrtl <- rfeControl(
  functions = rfFuncs,
  method = "cv",
  number = 10,
  allowParallel = TRUE
)
results <- rfe(
  quality ~ .,
  data = data_train,
  sizes = 1:13,
  rfeControl = rfecrtl
)

results
predictors(results)

# Performance profile across subset sizes (grid plus points and lines).
plot(results, type = c("g", "o"))
```

## Train model
```{r}
# Resampling settings for caret::train(): 5-fold cross-validation with class
# probabilities, per-iteration logging and parallel execution enabled.
# `repeats` is intentionally not set: it only applies to repeated CV and
# caret reports "`repeats` has no meaning for this resampling method" when it
# is passed together with method = "cv".
ctrl_fast <- trainControl(
  method = "cv",
  number = 5,
  # summaryFunction = twoClassSummary,
  verboseIter = TRUE,
  classProbs = TRUE,
  allowParallel = TRUE
)
```

```{r}
# The original first statement built a resampled copy of `data_train`
# (100 "medium" rows unioned back onto the table) but never assigned it, so
# it had no effect on `data_train`. It has been removed; class imbalance is
# handled by caret's "up" sampling option set below.
ctrl_fast$sampling <- "up"

# Tuning grid for a radial SVM. Only used when `tuneGrid = svmGrid` is passed
# to train(); the random-forest fit below does not use it.
svmGrid <- expand.grid(
  sigma = c(0.001, 0.0001, 0.00001),
  C = c(1, 2, 4, 8, 16, 32, 64, 80, 100, 120)
)

# Predict the quality class from every other column.
train_formula <- formula(quality ~ .)

# Options tried earlier and left disabled: method = "xgbTree", tuneLength = 9,
# tuneGrid = svmGrid, preProcess = c("scale", "center"), metric = "ROC",
# weights = model_weights.
rfFitupsam <- train(
  train_formula,
  data = data_train,
  method = "rf",
  trControl = ctrl_fast
)

# Print the resampling summary. Further inspection, if needed:
#   plot(rfFitupsam); rfFitupsam$finalModel
rfFitupsam
```
```{r}
# Unscaled variable importance of the fitted model, plotted per predictor.
importance <- varImp(
  rfFitupsam,
  scale = FALSE
)
plot(importance)
```
## Test model
```{r}
# Predict the quality class for the held-out rows.
predsrfprobsamp <- predict(rfFitupsam, data_test)

# Variant for a regression model (round numeric predictions first):
#confusionMatrix(as.factor(predsrfprobsamp %>% round()),as.factor(data_test$quality))

# Build the reference factor with the same levels as the predictions.
# as.factor() alone only creates levels for classes that occur in the test
# split, which gives mismatching levels whenever a class is absent from it.
confusionMatrix(
  predsrfprobsamp,
  factor(data_test$quality, levels = levels(predsrfprobsamp))
)

```
```{r}
# Variant for a regression model (round numeric predictions first):
#confusionmat <- table(predsrfprobsamp %>% round(),as.factor(data_test$quality))

# Cross-tabulate predictions against the true labels. Both dimensions are
# named so the long-format columns below are `Predicted` and `Actual`.
confusionmat <- table(
  Predicted = predsrfprobsamp,
  Actual = as.factor(data_test$quality)
)

confusionmat

# Heatmap of the confusion matrix: one tile per (predicted, actual) pair,
# labelled with its count. as.data.frame() on a table yields one row per
# cell, with the count stored in the column named by `responseName`.
as.data.frame(confusionmat, responseName = "value") %>%
  ggplot(aes(x = Predicted, y = Actual)) +
  geom_tile(aes(fill = value), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", value)), vjust = 1) +
  scale_fill_gradient(low = "blue", high = "red") +
  xlab("Predicted quality") +
  ylab("Actual quality") +
  # Fixed axis orders for the three-class labelling.
  scale_y_discrete(limits = c("low", "medium", "high")) +
  scale_x_discrete(limits = c("high", "medium", "low")) +
  # Axis orders for the per-score labelling (disabled):
  #scale_y_discrete(limits=c('three','six','seven','four','five','eight'))+
  #scale_x_discrete(limits=c('eight','five','four','seven','six','three'))+
  theme_bw() +
  theme(legend.position = "none")
```

