library(tidyverse)
library(caret)
library(doMC)
library(ggcorrplot)
registerDoMC(cores=7)
Load Datasets
winedataset_blanco <- read_csv("blanco_train.csv.gz")
Parsed with column specification:
cols(
`fixed acidity` = col_double(),
`volatile acidity` = col_double(),
`citric acid` = col_double(),
`residual sugar` = col_double(),
chlorides = col_double(),
`free sulfur dioxide` = col_double(),
`total sulfur dioxide` = col_double(),
density = col_double(),
pH = col_double(),
sulphates = col_double(),
alcohol = col_double(),
quality = col_integer()
)
winedataset_red <- read_csv("tinto_train.csv.gz")
Parsed with column specification:
cols(
`fixed acidity` = col_double(),
`volatile acidity` = col_double(),
`citric acid` = col_double(),
`residual sugar` = col_double(),
chlorides = col_double(),
`free sulfur dioxide` = col_double(),
`total sulfur dioxide` = col_double(),
density = col_double(),
pH = col_double(),
sulphates = col_double(),
alcohol = col_double(),
quality = col_integer()
)
winedataset_blanco$type="white"
winedataset_red$type="red"
winedataset<-rbind(winedataset_blanco,winedataset_red)
winedataset
winedataset %>% map(is.null)
$`fixed acidity`
[1] FALSE
$`volatile acidity`
[1] FALSE
$`citric acid`
[1] FALSE
$`residual sugar`
[1] FALSE
$chlorides
[1] FALSE
$`free sulfur dioxide`
[1] FALSE
$`total sulfur dioxide`
[1] FALSE
$density
[1] FALSE
$pH
[1] FALSE
$sulphates
[1] FALSE
$alcohol
[1] FALSE
$quality
[1] FALSE
$type
[1] FALSE
winedataset %>% group_by(quality) %>% summarise(total=n())
winedataset %>% group_by(`total sulfur dioxide`,quality) %>% summarise(total=n())
Exploratory Analysis
Correlation Matrix
# Correlation matrix
cor_matrix<-cor(winedataset %>% select(-type))
ggcorrplot(cor_matrix)

Boxplot volatile
ggplot(winedataset)+
geom_boxplot(aes(x=as.factor(quality),y=`volatile acidity`,fill=as.factor(quality)))

NA
Boxplot alcohol
ggplot(winedataset)+
geom_boxplot(aes(x=as.factor(quality),y=`alcohol`))
Create categorical features
Create category labels for quality
Create clustering labels (dbscan)
Create clustering labels (kmeans)
Eliminate type
trainset <- trainset %>% select(-type)
names(trainset)
[1] "fixed acidity" "volatile acidity" "citric acid" "residual sugar" "chlorides"
[6] "free sulfur dioxide" "total sulfur dioxide" "density" "pH" "sulphates"
[11] "alcohol" "quality" "cluster"
Split train and test
trainIndex <- createDataPartition(as.factor(trainset$quality), p=0.80, list=FALSE)
data_train <- trainset[ trainIndex,]
data_test <- trainset[-trainIndex,]
colnames(data_train) <- make.names(colnames(data_train))
colnames(data_test) <- make.names(colnames(data_test))
Plot class distribution in train
data_train %>% group_by(quality) %>% summarise(total=n()) %>%
ggplot()+
geom_col(aes(x=quality,y=total,fill=quality))+
theme_classic()

Plot class distribution in test
data_test %>% group_by(quality) %>% summarise(total=n()) %>%
ggplot()+
geom_col(aes(x=quality,y=total,fill=quality))+
theme_classic()

Feature selection
Train model
ctrl_fast <- trainControl(method="cv",
repeats=1,
number=5,
# summaryFunction=twoClassSummary,
verboseIter=T,
classProbs=T,
allowParallel = TRUE)
`repeats` has no meaning for this resampling method.
importance <- varImp(rfFitupsam, scale=FALSE)
plot(importance)

Test model
predsrfprobsamp=predict(rfFitupsam,data_test)
# use for regression
#confusionMatrix(as.factor(predsrfprobsamp %>% round()),as.factor(data_test$quality))
confusionMatrix(predsrfprobsamp,as.factor(data_test$quality))
Confusion Matrix and Statistics
Reference
Prediction high low medium
high 122 0 45
low 0 3 4
medium 82 37 745
Overall Statistics
Accuracy : 0.8382
95% CI : (0.8143, 0.8601)
No Information Rate : 0.7649
P-Value [Acc > NIR] : 4.354e-09
Kappa : 0.5116
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: high Class: low Class: medium
Sensitivity 0.5980 0.075000 0.9383
Specificity 0.9460 0.995992 0.5123
Pos Pred Value 0.7305 0.428571 0.8623
Neg Pred Value 0.9059 0.964113 0.7184
Prevalence 0.1965 0.038536 0.7649
Detection Rate 0.1175 0.002890 0.7177
Detection Prevalence 0.1609 0.006744 0.8324
Balanced Accuracy 0.7720 0.535496 0.7253

---
title: "Wine Quality meetup 20/03/2019"
output: html_notebook
---

```{r}
library(tidyverse)
library(caret)
library(doMC)
library(ggcorrplot)
registerDoMC(cores=7)

```
## Load Datasets
```{r}
# Read the white ("blanco") and red ("tinto") training sets.
winedataset_blanco <- read_csv("blanco_train.csv.gz")
winedataset_red <- read_csv("tinto_train.csv.gz")

# Tag every row with its wine type so the origin survives the merge below.
winedataset_blanco$type <- "white"
winedataset_red$type <- "red"

# Stack both sets into a single data frame.
winedataset <- rbind(winedataset_blanco, winedataset_red)

winedataset

# Count missing values per column. is.null() on a column is always FALSE, so
# it cannot reveal missing data; is.na() inspects the individual values.
winedataset %>% map_int(~ sum(is.na(.x)))
```
```{r}
# Number of wines per quality score.
winedataset %>% count(quality, name = "total")

# Number of wines per (total sulfur dioxide, quality) pair. count() returns an
# ungrouped result, whereas group_by() on two variables followed by summarise()
# leaves the result grouped by the first variable.
winedataset %>% count(`total sulfur dioxide`, quality, name = "total")
```
## Exploratory Analysis
### Correlation Matrix
```{r}
# Correlation matrix of every column except the wine-type label.
cor_matrix <- winedataset %>%
  select(-type) %>%
  cor()

ggcorrplot(cor_matrix)
```
### Boxplot volatile
```{r}
# Distribution of volatile acidity for each quality score, one coloured box
# per score.
ggplot(
  winedataset,
  aes(x = as.factor(quality), y = `volatile acidity`, fill = as.factor(quality))
) +
  geom_boxplot()
  
```
### Boxplot alcohol
```{r}
# Distribution of alcohol content for each quality score.
ggplot(winedataset, aes(x = as.factor(quality), y = `alcohol`)) +
  geom_boxplot()
```
## Create categorical features
```{r eval=FALSE, include=FALSE}
# Replace some numeric measurements with coarse low/medium/high levels.
# NOTE(review): the new column is spelled `acohol_level` (sic); the name is
# kept unchanged in case other code depends on it.
trainset <- winedataset %>%
  mutate(
    # Conditions are tested in order, so each branch only needs its upper
    # bound; missing inputs stay missing.
    vinegar = case_when(
      `volatile acidity` <= 0.4 ~ "low",
      `volatile acidity` <= 0.8 ~ "medium",
      `volatile acidity` > 0.8 ~ "high"
    ),
    acohol_level = case_when(
      `alcohol` <= 9 ~ "low",
      `alcohol` <= 11 ~ "medium",
      `alcohol` > 11 ~ "high"
    )
  ) %>%
  # Drop these raw measurement columns from the training set.
  select(
    -`residual sugar`,
    -`fixed acidity`,
    -`volatile acidity`,
    -alcohol,
    -`free sulfur dioxide`
  )

#trainset<-winedataset %>% select(`volatile acidity`,density,pH,`residual sugar`,`fixed acidity`,`free sulfur dioxide`,quality)

#ggplot(trainset)+
#         geom_point(aes(x=`free sulfur dioxide`,y=`total sulfur dioxide`))
```

## Create category labels for quality
```{r eval=FALSE, include=FALSE}
set.seed(10)

# Collapse the numeric quality score into three classes:
#   score <= 4 -> "low", 5-6 -> "medium", score >= 7 -> "high".
# Ordered thresholds are used so that scores below 3 are labelled "low"
# instead of falling through to "high", and missing scores stay missing.
trainset <- winedataset %>%
  mutate(
    quality = case_when(
      quality <= 4 ~ "low",
      quality <= 6 ~ "medium",
      quality > 6 ~ "high"
    )
  ) # %>% filter(quality %in% c('seven','five','six'))

```
## Create clustering labels (dbscan)
```{r eval=FALSE, include=FALSE}
library(dbscan)

# Drop any `cluster` column left by an earlier clustering chunk so it is
# neither fed to the clustering as a feature nor duplicated by cbind() below.
trainset <- trainset[setdiff(names(trainset), "cluster")]

# Cluster on the predictors only (quality label and wine type excluded).
dbscan_mod <- dbscan(trainset %>% select(-quality, -type), eps = 5, minPts = 10)
trainset <- cbind(trainset, cluster = dbscan_mod$cluster)

# Cluster composition within each quality class.
trainset %>%
  group_by(quality, cluster) %>%
  summarise(n = n()) %>%
  ggplot() +
  geom_col(aes(x = quality, y = n, fill = as.factor(cluster)))

```

## Create clustering labels (kmeans)
```{r eval=FALSE, include=FALSE}


# Drop any `cluster` column left by an earlier clustering chunk so it is
# neither fed to k-means as a feature nor duplicated by cbind() below.
trainset <- trainset[setdiff(names(trainset), "cluster")]

# k-means with 8 centres and 40 random starts, on the predictors only
# (quality label and wine type excluded).
kmeans_mod <- kmeans(
  trainset %>% select(-quality, -type),
  centers = 8,
  nstart = 40
)
trainset <- cbind(trainset, cluster = kmeans_mod$cluster)

# Cluster sizes, one panel per quality class.
trainset %>%
  group_by(quality, cluster) %>%
  summarise(n = n()) %>%
  ggplot() +
  geom_col(aes(y = n, x = as.factor(cluster))) +
  facet_wrap(~quality)

trainset
```


## Eliminate type
```{r}

# The wine-type label is not used as a predictor; remove it.
trainset <- select(trainset, -type)
```

```{r}
# List the columns that will go into the train/test split.
colnames(trainset)
```
## Split train and test
```{r}

# 80/20 split, partitioned on the quality label.
trainIndex <- createDataPartition(
  as.factor(trainset$quality),
  p = 0.80,
  list = FALSE
)
data_train <- trainset[trainIndex, ]
data_test <- trainset[-trainIndex, ]

# Turn column names such as "fixed acidity" into syntactically valid R names.
names(data_train) <- make.names(names(data_train))
names(data_test) <- make.names(names(data_test))


```
### Plot class distribution in train
```{r}
# Bar chart of the number of training rows per quality class.
data_train %>%
  group_by(quality) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = quality, y = total, fill = quality)) +
  geom_col() +
  theme_classic()

```
### Plot class distribution in test
```{r}
# Bar chart of the number of test rows per quality class.
data_test %>%
  group_by(quality) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = quality, y = total, fill = quality)) +
  geom_col() +
  theme_classic()
```
## Feature selection
```{r eval=FALSE, include=FALSE}
# Recursive feature elimination with random-forest helper functions,
# evaluated by 10-fold cross-validation.
rfecrtl <- rfeControl(
  functions = rfFuncs,
  method = "cv",
  number = 10,
  allowParallel = TRUE
)

# Evaluate subset sizes 1 through 13.
results <- rfe(
  quality ~ .,
  data = data_train,
  sizes = seq_len(13),
  rfeControl = rfecrtl
)
results
predictors(results)

# Plot the results.
plot(results, type = c("g", "o"))
```

## Train model
```{r}
# Resampling setup for model training: 5-fold cross-validation with progress
# output, class probabilities and parallel execution enabled.
# `repeats` is omitted because it only applies to repeated CV; with
# method = "cv" caret warns that it "has no meaning for this resampling method".
ctrl_fast <- trainControl(
  method = "cv",
  number = 5,
  # summaryFunction = twoClassSummary,
  verboseIter = TRUE,
  classProbs = TRUE,
  allowParallel = TRUE
)
```

```{r}
# Class imbalance is handled through caret's `sampling = "up"` option, so
# data_train is passed to train() unmodified.
ctrl_fast$sampling <- "up"

# sigma/C tuning grid (looks intended for a radial-kernel SVM); it only takes
# effect if the commented-out `tuneGrid` argument below is enabled.
svmGrid <- expand.grid(
  sigma = c(0.001, 0.0001, 0.00001),
  C = c(1, 2, 4, 8, 16, 32, 64, 80, 100, 120)
)

# svmGrid <- expand.grid(C = c(100), sigma = c(1))

# Predict quality from every other column of data_train.
train_formula <- formula(quality ~ .)
rfFitupsam <- train(
  train_formula,
  data = data_train,
  method = "rf",
  # Alternative settings, currently disabled:
  # method = "xgbTree",
  # tuneLength = 9,
  # tuneGrid = svmGrid,
  # preProcess = c("scale", "center"),
  # metric = "ROC",
  # weights = model_weights,
  trControl = ctrl_fast
)

# plot(rfFitupsam)
rfFitupsam
# rfFitupsam$finalModel
```
```{r}
# Unscaled variable importance of the fitted model.
importance <- rfFitupsam %>% varImp(scale = FALSE)
plot(importance)
```
## Test model
```{r}
# Predict the quality class of the held-out rows.
predsrfprobsamp <- predict(rfFitupsam, newdata = data_test)

# Use for regression:
# confusionMatrix(as.factor(predsrfprobsamp %>% round()), as.factor(data_test$quality))

# Compare the predictions with the true labels.
confusionMatrix(
  data = predsrfprobsamp,
  reference = as.factor(data_test$quality)
)

```
```{r}
# confusionmat <- table(predsrfprobsamp %>% round(), as.factor(data_test$quality))

# Cross-tabulate predictions against the true labels. Both dimensions are
# named explicitly so the long-format columns used by the plot have stable
# names instead of an auto-generated one.
confusionmat <- table(
  predicted = predsrfprobsamp,
  actual = as.factor(data_test$quality)
)

confusionmat

# Heatmap of the confusion matrix: one tile per (predicted, actual) pair,
# annotated with its count. as.data.frame() on a table yields one row per
# cell, with the count stored in the `value` column.
as.data.frame(confusionmat, responseName = "value") %>%
  ggplot(aes(x = predicted, y = actual)) +
  geom_tile(aes(fill = value), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", value)), vjust = 1) +
  scale_fill_gradient(low = "blue", high = "red") +
  xlab("Predicted quality") +
  ylab("Actual quality") +
  scale_y_discrete(limits = c("low", "medium", "high")) +
  scale_x_discrete(limits = c("high", "medium", "low")) +
  # Limits for the six-class variant of the labels:
  # scale_y_discrete(limits=c('three','six','seven','four','five','eight'))+
  # scale_x_discrete(limits=c('eight','five','four','seven','six','three'))+
  theme_bw() +
  theme(legend.position = "none")
```

