# One-time setup: install any package this analysis attaches that is not yet
# present in the local library. Packages that are already installed are left
# untouched, so re-running the script does not re-download them.
required_packages <- c(
  "readxl", "caTools", "corrgram", "knitr", "ggplot2", "dplyr", "tidyr",
  "GGally"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach the packages used throughout the script (one statement per line;
# R cannot parse several calls on one line without a separator).
library(ggplot2)
library(GGally)
library(readxl)
library(caTools)
library(corrgram)
library(knitr)
library(dplyr)
library(tidyr)

# Read the red and white wine measurements from the Excel workbooks, which
# are looked up relative to the current working directory.
library(readxl)
wine_red <- read_excel("wine-red2.xlsx")
wine_white <- read_excel("wine-white2.xlsx")

# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session; skip it when the script runs in batch mode.
if (interactive()) {
  View(wine_red)
  View(wine_white)
}

red <- wine_red
white <- wine_white
# add a categorical variable recording the wine colour to both sets
red["color"] <- "red"
white["color"] <- "white"
# merge red wine and white wine datasets (red rows first, then white)
data <- rbind(red, white)
head(data)
tail(data)
dim(data)
## [1] 6497   13
# Column names as imported from the workbooks (they contain spaces).
names(data)
##  [1] "fixed acidity"        "volatile acidity"     "citric acid"         
##  [4] "residual sugar"       "chlorides"            "free sulfur dioxide" 
##  [7] "total sulfur dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"             
## [13] "color"
# Replace them positionally with space-free names; the order below must match
# the column order printed above.
clean_names <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free_sulfurdioxide", "total_sulfurdioxide", "density",
  "pH", "sulphates", "alcohol", "quality", "color"
)
names(data) <- clean_names
summary(data)
##  fixed_acidity    volatile_acidity  citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :0.2900   Median :0.3100   Median : 3.000  
##  Mean   : 7.215   Mean   :0.3397   Mean   :0.3186   Mean   : 5.443  
##  3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1.5800   Max.   :1.6600   Max.   :65.800  
##    chlorides       free_sulfurdioxide total_sulfurdioxide    density         
##  Min.   :0.00900   Min.   :  1.00     Min.   :  6.0       Min.   :     0.99  
##  1st Qu.:0.03800   1st Qu.: 17.00     1st Qu.: 77.0       1st Qu.:     0.99  
##  Median :0.04700   Median : 29.00     Median :118.0       Median :     0.99  
##  Mean   :0.05603   Mean   : 30.53     Mean   :115.7       Mean   :    96.81  
##  3rd Qu.:0.06500   3rd Qu.: 41.00     3rd Qu.:156.0       3rd Qu.:     1.00  
##  Max.   :0.61100   Max.   :289.00     Max.   :440.0       Max.   :100196.00  
##        pH          sulphates         alcohol             quality     
##  Min.   :2.720   Min.   :0.2200   Min.   :      8.0   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.:      9.5   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :     10.3   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :    412.1   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:     11.3   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :1490646.0   Max.   :9.000  
##     color          
##  Length:6497       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Inspect the column types of the merged data.
str(data)
## tibble [6,497 × 13] (S3: tbl_df/tbl/data.frame)
##  $ fixed_acidity      : num [1:6497] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile_acidity   : num [1:6497] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric_acid        : num [1:6497] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual_sugar     : num [1:6497] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides          : num [1:6497] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free_sulfurdioxide : num [1:6497] 11 25 15 17 11 13 15 15 9 17 ...
##  $ total_sulfurdioxide: num [1:6497] 34 67 54 60 34 40 59 21 18 102 ...
##  $ density            : num [1:6497] 0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                 : num [1:6497] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates          : num [1:6497] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol            : num [1:6497] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality            : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
##  $ color              : chr [1:6497] "red" "red" "red" "red" ...
#data$x <- 1:nrow(data)
#data

#str(data)
# Round-trip each measurement column through character and back to double,
# one column at a time.
measurement_cols <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free_sulfurdioxide", "total_sulfurdioxide", "density",
  "pH", "sulphates", "alcohol"
)
for (col_name in measurement_cols) {
  data[[col_name]] <- as.numeric(as.character(data[[col_name]]))
}
# The quality score is stored as an integer rather than a double.
data[["quality"]] <- as.integer(as.character(data[["quality"]]))
# Exploratory Data Analysis (EDA) and Pre-Processing
summary(data)
##  fixed_acidity    volatile_acidity  citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :0.2900   Median :0.3100   Median : 3.000  
##  Mean   : 7.215   Mean   :0.3397   Mean   :0.3186   Mean   : 5.443  
##  3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1.5800   Max.   :1.6600   Max.   :65.800  
##    chlorides       free_sulfurdioxide total_sulfurdioxide    density         
##  Min.   :0.00900   Min.   :  1.00     Min.   :  6.0       Min.   :     0.99  
##  1st Qu.:0.03800   1st Qu.: 17.00     1st Qu.: 77.0       1st Qu.:     0.99  
##  Median :0.04700   Median : 29.00     Median :118.0       Median :     0.99  
##  Mean   :0.05603   Mean   : 30.53     Mean   :115.7       Mean   :    96.81  
##  3rd Qu.:0.06500   3rd Qu.: 41.00     3rd Qu.:156.0       3rd Qu.:     1.00  
##  Max.   :0.61100   Max.   :289.00     Max.   :440.0       Max.   :100196.00  
##        pH          sulphates         alcohol             quality     
##  Min.   :2.720   Min.   :0.2200   Min.   :      8.0   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.:      9.5   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :     10.3   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :    412.1   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:     11.3   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :1490646.0   Max.   :9.000  
##     color          
##  Length:6497       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Correlogram of the merged data before any outlier removal; variables are
# reordered (order = TRUE) and the lower triangle uses panel.conf.
library(corrgram)
corrgram(x = data, lower.panel = panel.conf, order = TRUE)
## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

# Density values flagged as potential outliers by the IQR criterion, taken
# from the `out` component returned by boxplot.stats():
boxplot.stats(data[["density"]])[["out"]]
##  [1] 1.00100e+03 1.00100e+03 1.00020e+04 1.00040e+04 1.01030e+00 1.01030e+00
##  [7] 1.00182e+05 1.03898e+00 1.00014e+05 1.00196e+05 1.00037e+05 1.00038e+05
## [13] 1.00038e+05

# Thanks to the which() function, it is possible to extract the row numbers corresponding to these outliers:

# Row positions (within `data`) of the density values flagged as outliers.
out <- boxplot.stats(data$density)$out
out_ind <- which(data$density %in% out)
out_ind
##  [1] 1791 2643 2665 3180 3253 3263 3850 4381 4862 5020 5145 5614 5618

# Tukey fences for density: 1.5 * IQR below the first quartile and above the
# third quartile.
Q <- quantile(data$density, probs = c(0.25, 0.75), na.rm = FALSE)
iqr <- IQR(data$density)

up <- Q[2] + 1.5 * iqr # Upper Range
low <- Q[1] - 1.5 * iqr # Lower Range

# Keep only the rows whose density lies strictly inside the fences. The
# bounds computed above are reused so the limits are defined in one place.
eliminated <- subset(data, data$density > low & data$density < up)


eliminated
summary(eliminated)
##  fixed_acidity    volatile_acidity  citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :0.2900   Median :0.3100   Median : 3.000  
##  Mean   : 7.214   Mean   :0.3397   Mean   :0.3184   Mean   : 5.406  
##  3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1.5800   Max.   :1.6600   Max.   :26.050  
##    chlorides       free_sulfurdioxide total_sulfurdioxide    density      
##  Min.   :0.00900   Min.   :  1.00     Min.   :  6.0       Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00     1st Qu.: 77.0       1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00     Median :118.0       Median :0.9949  
##  Mean   :0.05604   Mean   : 30.51     Mean   :115.6       Mean   :0.9947  
##  3rd Qu.:0.06500   3rd Qu.: 41.00     3rd Qu.:156.0       3rd Qu.:0.9969  
##  Max.   :0.61100   Max.   :289.00     Max.   :440.0       Max.   :1.0037  
##        pH          sulphates         alcohol             quality     
##  Min.   :2.720   Min.   :0.2200   Min.   :      8.0   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.:      9.5   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :     10.3   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :    412.9   Mean   :5.819  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:     11.3   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :1490646.0   Max.   :9.000  
##     color          
##  Length:6484       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Alcohol values flagged as outliers among the density-filtered rows.
boxplot(eliminated$alcohol, plot = FALSE)$out
## [1] 1490646 1006404  102411   10050
# Row positions of the alcohol outliers, indexed against the unfiltered `data`.
out <- boxplot.stats(data$alcohol)$out
out_ind <- which(data$alcohol %in% out)
out_ind
## [1]  653 1014 5518 6103
outliers1 <- boxplot(eliminated$alcohol, plot = FALSE)$out
# Drop the flagged rows with a logical mask. Negative indexing via
# `-which(...)` would return zero rows if no value were flagged, because
# `x[-integer(0), ]` selects nothing; the mask is safe in that case.
df <- eliminated[!(eliminated$alcohol %in% outliers1), ]
summary(df)
##  fixed_acidity    volatile_acidity  citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :0.2900   Median :0.3100   Median : 3.000  
##  Mean   : 7.213   Mean   :0.3396   Mean   :0.3185   Mean   : 5.406  
##  3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.600   Max.   :1.5800   Max.   :1.6600   Max.   :26.050  
##    chlorides       free_sulfurdioxide total_sulfurdioxide    density      
##  Min.   :0.00900   Min.   :  1.00     Min.   :  6.0       Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00     1st Qu.: 77.0       1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00     Median :118.0       Median :0.9949  
##  Mean   :0.05603   Mean   : 30.51     Mean   :115.6       Mean   :0.9947  
##  3rd Qu.:0.06500   3rd Qu.: 41.00     3rd Qu.:156.0       3rd Qu.:0.9969  
##  Max.   :0.61100   Max.   :289.00     Max.   :440.0       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :10.30   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :10.49   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:11.30   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.00   Max.   :9.000  
##     color          
##  Length:6480       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Distribution of the quality scores in the cleaned data.
hist(df$quality)

summary(df[["alcohol"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00    9.50   10.30   10.49   11.30   14.00
# Locate and count missing cells, then drop any row that contains one.
missing_mask <- is.na(df)
which(missing_mask)
## integer(0)
sum(missing_mask)
## [1] 0
df <- na.omit(df)
summary(df)
##  fixed_acidity    volatile_acidity  citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :0.2900   Median :0.3100   Median : 3.000  
##  Mean   : 7.213   Mean   :0.3396   Mean   :0.3185   Mean   : 5.406  
##  3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.600   Max.   :1.5800   Max.   :1.6600   Max.   :26.050  
##    chlorides       free_sulfurdioxide total_sulfurdioxide    density      
##  Min.   :0.00900   Min.   :  1.00     Min.   :  6.0       Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00     1st Qu.: 77.0       1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00     Median :118.0       Median :0.9949  
##  Mean   :0.05603   Mean   : 30.51     Mean   :115.6       Mean   :0.9947  
##  3rd Qu.:0.06500   3rd Qu.: 41.00     3rd Qu.:156.0       3rd Qu.:0.9969  
##  Max.   :0.61100   Max.   :289.00     Max.   :440.0       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :10.30   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :10.49   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:11.30   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.00   Max.   :9.000  
##     color          
##  Length:6480       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Re-check for missing cells after na.omit(), then summarise alcohol again.
remaining_na <- is.na(df)
which(remaining_na)
## integer(0)
sum(remaining_na)
## [1] 0
summary(df[["alcohol"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00    9.50   10.30   10.49   11.30   14.00

# Exploratory Data Analysis (EDA) and Data Pre-processing

# Per-column summary statistics of the cleaned data set.
summary(object = df)
##  fixed_acidity    volatile_acidity  citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :0.2900   Median :0.3100   Median : 3.000  
##  Mean   : 7.213   Mean   :0.3396   Mean   :0.3185   Mean   : 5.406  
##  3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.600   Max.   :1.5800   Max.   :1.6600   Max.   :26.050  
##    chlorides       free_sulfurdioxide total_sulfurdioxide    density      
##  Min.   :0.00900   Min.   :  1.00     Min.   :  6.0       Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00     1st Qu.: 77.0       1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00     Median :118.0       Median :0.9949  
##  Mean   :0.05603   Mean   : 30.51     Mean   :115.6       Mean   :0.9947  
##  3rd Qu.:0.06500   3rd Qu.: 41.00     3rd Qu.:156.0       3rd Qu.:0.9969  
##  Max.   :0.61100   Max.   :289.00     Max.   :440.0       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :10.30   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :10.49   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:11.30   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.00   Max.   :9.000  
##     color          
##  Length:6480       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

# Observations from the summary: the mean residual sugar level is 5.4 g/l. The mean free sulfur dioxide is 30.5 ppm; its maximum of 289 ppm is very high, given that the third quartile is only 41 ppm. The pH of the wines ranges from 2.7 to 4.0, with a mean of 3.2.

# Alcohol: the lightest wine has 8% alcohol; the strongest has 14%.

# The minimum quality mark is 3, the mean is 5.8, and the highest is 9.

# Correlogram of the cleaned data; variables are reordered (order = TRUE)
# and the lower triangle uses panel.conf.
library(corrgram)
corrgram(x = df, lower.panel = panel.conf, order = TRUE)
## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

# Correlation of every predictor with the quality score. The predictors are
# selected by name so that `quality` is not correlated with itself (which
# only yields a trivial 1.0 row) and the call does not depend on column
# positions.
predictor_cols <- setdiff(names(df), c("quality", "color"))
cor(x = df[, predictor_cols], y = df$quality)
##                            [,1]
## fixed_acidity       -0.07549878
## volatile_acidity    -0.26696446
## citric_acid          0.08717157
## residual_sugar      -0.03748329
## chlorides           -0.20059999
## free_sulfurdioxide   0.05553655
## total_sulfurdioxide -0.04154195
## density             -0.31314424
## pH                   0.01895031
## sulphates            0.03844359
## alcohol              0.44541285
#alcohol: 0.445
#citric acid: 0.087
#free sulfur dioxide: 0.056
#sulphates: 0.038
#pH: 0.019

#density: -0.313
#volatile_acidity: -0.267
#chlorides: -0.201
#fixed_acidity: -0.075
#total sulfur dioxide: -0.042
#residual_sugar: -0.037


#Quality is mostly correlated with amount of alcohol.

#The strongest positive correlation with quality is alcohol (0.45); citric acid (0.09), free sulfur dioxide (0.06), sulphates (0.04) and pH (0.02) are only weakly positive. The strongest negative correlations are density (-0.31), volatile acidity (-0.27) and chlorides (-0.20); fixed acidity (-0.08), total sulfur dioxide (-0.04) and residual sugar (-0.04) are only weakly negative.
summary(df$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.818   6.000   9.000
table(df$quality)
## 
##    3    4    5    6    7    8    9 
##   30  216 2132 2828 1076  193    5
library(ggplot2)
# Stacked histogram of quality by wine colour. The visible x range is set
# with coord_cartesian() rather than scale limits: scale limits discard bins
# that straddle the boundary (the bar for quality 3 spans 2.5-3.5), which is
# what produced the "Removed 4 rows containing missing values" warning.
ggplot(df, aes(x = quality, fill = color)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(3, 10, 1)) +
  coord_cartesian(xlim = c(2.5, 10.5))

summary(df$alcohol)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00    9.50   10.30   10.49   11.30   14.00
# Stacked histogram of alcohol by wine colour. `boundary = 8` aligns the bin
# edges with the minimum value so that no bin starts below the axis range.
ggplot(df, aes(x = alcohol, fill = color)) +
  geom_histogram(binwidth = 0.5, boundary = 8) +
  scale_x_continuous(breaks = seq(8, 15, 0.5)) +
  coord_cartesian(xlim = c(8, 15))

# Density of the quality score per wine colour. A density geom has no
# `binwidth` parameter, so none is passed.
ggplot(df, aes(x = quality, colour = color)) +
  geom_density() +
  scale_x_continuous(breaks = seq(3, 9, 1))

# Factor-encoded copy of every column of `data`, one level per distinct value.
# NOTE(review): these are built from `data` (before outlier removal), not from
# the cleaned `df` -- confirm that this is intended.
ffixed_acidity <- as.factor(as.numeric(data[["fixed_acidity"]]))
fvolatile_acidity <- as.factor(as.numeric(data[["volatile_acidity"]]))
fcitric_acid <- as.factor(as.numeric(data[["citric_acid"]]))
fresidual_sugar <- as.factor(as.numeric(data[["residual_sugar"]]))
fchlorides <- as.factor(as.numeric(data[["chlorides"]]))
ffree_sulfurdioxide <- as.factor(as.numeric(data[["free_sulfurdioxide"]]))
ftotal_sulfurdioxide <- as.factor(as.numeric(data[["total_sulfurdioxide"]]))
fdensity <- as.factor(as.numeric(data[["density"]]))
fpH <- as.factor(as.numeric(data[["pH"]]))
fsulphates <- as.factor(as.numeric(data[["sulphates"]]))
falcohol <- as.factor(as.numeric(data[["alcohol"]]))
fquality <- as.factor(as.integer(data[["quality"]]))
# Descriptive statistics of the quality score in the cleaned data.
quality_min <- min(df$quality)
quality_max <- max(df$quality)
quality_mean <- mean(df$quality)
quality_median <- median(df$quality)
quality_iqr <- IQR(df$quality)
# The quartiles are taken directly from the data. "median - IQR" and
# "median + IQR" are not the first and third quartiles: for these scores they
# would give a third quartile of 7, whereas summary() reports 6.
quality_q1 <- quantile(df$quality, probs = 0.25, names = FALSE)
quality_q3 <- quantile(df$quality, probs = 0.75, names = FALSE)

alcohol_mean <- mean(df$alcohol)

# Visualization of Wine Quality

# Bar chart of the number of wines per quality rating, with the count printed
# above each bar. `quality` is numeric, so it is wrapped in factor() for the
# fill aesthetic; a continuous fill does not give one colour per bar.
# after_stat(count) replaces the deprecated `..count..` notation.
ggplot(df, aes(x = quality, fill = factor(quality))) +
  geom_bar(stat = "count") +
  geom_text(
    position = "stack", stat = "count",
    aes(label = after_stat(count)), vjust = -0.5
  ) +
  labs(y = "Num of Observations", x = "Wine Quality", fill = "Wine Quality") +
  labs(title = "Distribution of Wine Quality Ratings")

### Quality is not evenly distributed across the 0-10 scale: most ratings are
### 5 or 6. In other words, there are many more average wines than excellent
### or poor ones.

#df%>% 
#  gather(df$quality, key = "var", value = "value") %>% 
#  ggplot(aes(x = quality, y = value, color = quality)) +
#    geom_boxplot() +
#    facet_wrap(~ var, scales = "free", ncol = 3)+
#    theme(legend.position="none")
# Standardise the 11 physico-chemical predictors only. Previously columns
# 1:12 were scaled (which includes `quality`), the raw `quality` was appended
# as a 13th column, and colnames(df) then labelled that column "color".
predictor_cols <- setdiff(names(df), c("quality", "color"))
wine2 <- scale(df[, predictor_cols])
# Append quality as a factor in column 12 so it can serve as a qualitative
# (grouping) variable rather than as a numeric measurement.
wine_scaled <- cbind.data.frame(wine2, quality = factor(df$quality))
head(wine_scaled)
#install.packages("FactoMineR")
#install.packages("factoextra")

# Principal component analysis of `wine_scaled` and its biplot.
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
set.seed(150)

# Column 12 is declared as a supplementary qualitative variable; no plot is
# drawn by PCA() itself.
res <- PCA(wine_scaled, quali.sup = 12, graph = FALSE)
# Biplot: individuals drawn as points and coloured by column 12, variables
# drawn in black, labels repelled to limit overlap.
fviz_pca_biplot(
  res,
  geom.ind = "point",
  habillage = 12,
  col.var = "black",
  repel = TRUE
)

# Split the cleaned data into training and test sets with caTools, using a
# fixed seed so the split is reproducible. SplitRatio = 0.8 is the share of
# rows marked TRUE, which go to the training set.
library(caTools)
set.seed(150)
split <- sample.split(df$quality, SplitRatio = 0.8)
training_set <- subset(df, split)
test_set <- subset(df, !split)
#Logistic Regression

#log_reg <- glm(df$quality ~ df$density, #data=training_set, family = binomial)

#summary(log_reg)
#Decision Tree Regression

#library(rpart)
#classifier = rpart(formula = quality ~ .,
#                   data=training_set)
#y_pred = predict(classifier, newdata = test_set[-12],type="vector")


#cm = as.matrix(table(actual=test_set$quality,predicted=y_pred))
#cm
#RANDOM FOREST
#library(randomForest)

#set.seed(150)
#classifier1 = randomForest(x = training_set[-12],
#                           y = training_set$quality,
#                           ntree = 500)
#Predicting the Test set results

#y_pred1 = predict(classifier1, newdata = test_set[-12])

#summary(y_pred1)
# the Confusion Matrix
#cm1 = table(actual=test_set[, 12],predicted= y_pred1)
#summary(cm1)