# Packages used by this analysis (one library() call per line; two calls on
# the same line without a separator do not parse).
library(ggplot2)
library(GGally)
library(readxl)

# Folder holding the two wine-quality workbooks.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
data_dir <- "C:/Users/PC/Desktop/ads542-project"

# Read the red and white wine sheets.
wine_red <- read_excel(file.path(data_dir, "wine-red2.xlsx"))
wine_white <- read_excel(file.path(data_dir, "wine-white2.xlsx"))

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(wine_red)
  View(wine_white)
}

red <- wine_red
white <- wine_white
# Add a categorical colour variable to both sets
red["color"] <- "red"
white["color"] <- "white"
# Merge the red wine and white wine datasets (rows of red first, then white)
data <- rbind(red, white)
head(data)
## # A tibble: 6 x 13
##   `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
##             <dbl>              <dbl>         <dbl>            <dbl>     <dbl>
## 1             7.4               0.7           0                 1.9     0.076
## 2             7.8               0.88          0                 2.6     0.098
## 3             7.8               0.76          0.04              2.3     0.092
## 4            11.2               0.28          0.56              1.9     0.075
## 5             7.4               0.7           0                 1.9     0.076
## 6             7.4               0.66          0                 1.8     0.075
## # ... with 8 more variables: free sulfur dioxide <dbl>,
## #   total sulfur dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## #   alcohol <dbl>, quality <dbl>, color <chr>
tail(data)
## # A tibble: 6 x 13
##   `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
##             <dbl>              <dbl>         <dbl>            <dbl>     <dbl>
## 1             6.5               0.23          0.38              1.3     0.032
## 2             6.2               0.21          0.29              1.6     0.039
## 3             6.6               0.32          0.36              8       0.047
## 4             6.5               0.24          0.19              1.2     0.041
## 5             5.5               0.29          0.3               1.1     0.022
## 6             6                 0.21          0.38              0.8     0.02 
## # ... with 8 more variables: free sulfur dioxide <dbl>,
## #   total sulfur dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## #   alcohol <dbl>, quality <dbl>, color <chr>
dim(data)
## [1] 6497   13
names(data)
##  [1] "fixed acidity"        "volatile acidity"     "citric acid"         
##  [4] "residual sugar"       "chlorides"            "free sulfur dioxide" 
##  [7] "total sulfur dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"             
## [13] "color"
# Replace the spreadsheet headers with the names used by the rest of the
# script. The two sulfur dioxide columns keep their spaces; later statements
# refer to them with backticks.
colnames(data) <- c(
  "fixed_acidity",
  "volatile_acidity",
  "citric_acid",
  "residual_sugar",
  "chlorides",
  "free sulfur dioxide",
  "total sulfur dioxide",
  "density",
  "pH",
  "sulphates",
  "alcohol",
  "quality",
  "color"
)
summary(data)
##  fixed_acidity    volatile_acidity    citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :   0.080   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:   0.230   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :   0.290   Median :0.3100   Median : 3.000  
##  Mean   : 7.215   Mean   :   1.319   Mean   :0.3186   Mean   : 5.443  
##  3rd Qu.: 7.700   3rd Qu.:   0.400   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1185.000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free sulfur dioxide total sulfur dioxide    density         
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :     0.99  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 77.0        1st Qu.:     0.99  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :     0.99  
##  Mean   :0.05603   Mean   : 30.53      Mean   :115.7        Mean   :   739.66  
##  3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:156.0        3rd Qu.:     1.00  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :103898.00  
##        pH          sulphates         alcohol             quality     
##  Min.   :2.720   Min.   :0.2200   Min.   :8.000e+00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.:1.000e+01   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :1.000e+01   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :1.733e+12   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:1.100e+01   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :9.733e+14   Max.   :9.000  
##     color          
##  Length:6497       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
str(data)
## tibble [6,497 x 13] (S3: tbl_df/tbl/data.frame)
##  $ fixed_acidity       : num [1:6497] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile_acidity    : num [1:6497] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric_acid         : num [1:6497] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual_sugar      : num [1:6497] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num [1:6497] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free sulfur dioxide : num [1:6497] 11 25 15 17 11 13 15 15 9 17 ...
##  $ total sulfur dioxide: num [1:6497] 34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num [1:6497] 0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num [1:6497] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num [1:6497] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num [1:6497] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
##  $ color               : chr [1:6497] "red" "red" "red" "red" ...
# Add a running row index `x` (1..n). seq_len() is used instead of 1:nrow()
# because 1:0 would yield c(1, 0) if the table were ever empty.
data$x <- seq_len(nrow(data))
data
## # A tibble: 6,497 x 14
##    fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
##            <dbl>            <dbl>       <dbl>          <dbl>     <dbl>
##  1           7.4             0.7         0               1.9     0.076
##  2           7.8             0.88        0               2.6     0.098
##  3           7.8             0.76        0.04            2.3     0.092
##  4          11.2             0.28        0.56            1.9     0.075
##  5           7.4             0.7         0               1.9     0.076
##  6           7.4             0.66        0               1.8     0.075
##  7           7.9             0.6         0.06            1.6     0.069
##  8           7.3             0.65        0               1.2     0.065
##  9           7.8             0.58        0.02            2       0.073
## 10           7.5             0.5         0.36            6.1     0.071
## # ... with 6,487 more rows, and 9 more variables: free sulfur dioxide <dbl>,
## #   total sulfur dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## #   alcohol <dbl>, quality <dbl>, color <chr>, x <int>
str(data)
## tibble [6,497 x 14] (S3: tbl_df/tbl/data.frame)
##  $ fixed_acidity       : num [1:6497] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile_acidity    : num [1:6497] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric_acid         : num [1:6497] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual_sugar      : num [1:6497] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num [1:6497] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free sulfur dioxide : num [1:6497] 11 25 15 17 11 13 15 15 9 17 ...
##  $ total sulfur dioxide: num [1:6497] 34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num [1:6497] 0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num [1:6497] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num [1:6497] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num [1:6497] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
##  $ color               : chr [1:6497] "red" "red" "red" "red" ...
##  $ x                   : int [1:6497] 1 2 3 4 5 6 7 8 9 10 ...
# Columns holding continuous measurements; each is converted to double by way
# of its character representation, exactly as in the per-column version.
measurement_cols <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
  "pH", "sulphates", "alcohol"
)
data[measurement_cols] <- lapply(
  data[measurement_cols],
  function(values) as.numeric(as.character(values))
)

# quality is stored as an integer score rather than a double
data[["quality"]] <- as.integer(as.character(data[["quality"]]))

Exploratory Data Analysis (EDA) and Pre-Processing

summary(data)
##  fixed_acidity    volatile_acidity    citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :   0.080   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:   0.230   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :   0.290   Median :0.3100   Median : 3.000  
##  Mean   : 7.215   Mean   :   1.319   Mean   :0.3186   Mean   : 5.443  
##  3rd Qu.: 7.700   3rd Qu.:   0.400   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1185.000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free sulfur dioxide total sulfur dioxide    density         
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :     0.99  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 77.0        1st Qu.:     0.99  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :     0.99  
##  Mean   :0.05603   Mean   : 30.53      Mean   :115.7        Mean   :   739.66  
##  3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:156.0        3rd Qu.:     1.00  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :103898.00  
##        pH          sulphates         alcohol             quality     
##  Min.   :2.720   Min.   :0.2200   Min.   :8.000e+00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.:1.000e+01   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :1.000e+01   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :1.733e+12   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:1.100e+01   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :9.733e+14   Max.   :9.000  
##     color                 x       
##  Length:6497        Min.   :   1  
##  Class :character   1st Qu.:1625  
##  Mode  :character   Median :3249  
##                     Mean   :3249  
##                     3rd Qu.:4873  
##                     Max.   :6497

Extract the values of the potential outliers, based on the IQR criterion, using the boxplot.stats()$out function:

boxplot.stats(data$density)$out
##   [1] 100005 100005 100025 100015 100015   1001  10014  10001  10015  10015
##  [11]  10002  10008  10006  10004  10018  10001   1001  10022  10022  10014
##  [21]  10003  10014  10014  10004  10014  10004  10004  10032  10008  10006
##  [31]  10026  10002  10002  10002   1001   1001  10004  10004  10014  10008
##  [41] 100315 100315  10002 100315  10002  10021  10021  10003  10002  10002
##  [51]  10002  10004  10006  10006  10026  10006  10006   1001   1001  10004
##  [61]  10004  10001  10002 100024  10001 100012 100289 100369 100369 100242
##  [71] 100242   1001  10002   1001   1001  10002 100055  10006  10006   1001
##  [81]  10002  10002  10004  10006  10003  10003  10003  10004  10001  10005
##  [91]  10012  10004  10004  10024  10001  10103  10103  10004  10008  10002
## [101]  10008  10008  10007  10001  10001  10017  10017  10011  10011  10006
## [111]  10004  10004  10004  10002  10004  10001  10001  10001  10005  10001
## [121]  10001  10001  10001 100182 100047 100241 100098 100016 100051 100118
## [131] 100014  10002 100013 100013 103898 100014 100196 100037 100037 100295
## [141] 100295 100044 100044 100022 100038 100038

# The which() function extracts the row numbers corresponding to these outliers:

# Density values reported as outliers by boxplot.stats()
out <- boxplot.stats(data$density)[["out"]]
# Row positions whose density equals any of those values
out_ind <- which(is.element(data$density, out))
out_ind
##   [1]  244  245  267  285  286  290  295  296  325  326  329  339  345  351  354
##  [16]  357  360  365  367  375  377  382  392  396  416  434  436  443  460  466
##  [31]  481  489  494  500  516  517  532  533  539  545  555  556  557  558  559
##  [46]  560  565  571  581  582  585  594  602  604  609  612  619  634  652  656
##  [61]  681  699  738  744  745  812  890 1435 1436 1475 1477 1607 1614 1782 1791
##  [76] 2317 2362 2379 2383 2643 2665 2671 2678 2690 2867 2870 2876 2932 3017 3036
##  [91] 3054 3087 3180 3208 3234 3253 3263 3264 3280 3393 3407 3409 3448 3540 3542
## [106] 3558 3563 3573 3574 3585 3595 3597 3598 3599 3606 3651 3652 3656 3706 3710
## [121] 3711 3712 3715 3850 3869 3934 3978 3996 4011 4019 4034 4220 4234 4237 4381
## [136] 4862 5020 5145 5147 5219 5223 5278 5294 5330 5614 5618
# First and third quartiles of density
Q <- quantile(data$density, probs = c(0.25, 0.75), na.rm = FALSE)

# Interquartile range of density
iqr <- IQR(data$density)

# Fences placed 1.5 * IQR beyond the quartiles
up <- Q[2] + 1.5 * iqr   # upper bound
low <- Q[1] - 1.5 * iqr  # lower bound

# Keep only the rows whose density lies strictly between the two fences
eliminated <- subset(data, data$density > low & data$density < up)


eliminated
## # A tibble: 6,351 x 14
##    fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
##            <dbl>            <dbl>       <dbl>          <dbl>     <dbl>
##  1           7.4             0.7         0               1.9     0.076
##  2           7.8             0.88        0               2.6     0.098
##  3           7.8             0.76        0.04            2.3     0.092
##  4          11.2             0.28        0.56            1.9     0.075
##  5           7.4             0.7         0               1.9     0.076
##  6           7.4             0.66        0               1.8     0.075
##  7           7.9             0.6         0.06            1.6     0.069
##  8           7.3             0.65        0               1.2     0.065
##  9           7.8             0.58        0.02            2       0.073
## 10           7.5             0.5         0.36            6.1     0.071
## # ... with 6,341 more rows, and 9 more variables: free sulfur dioxide <dbl>,
## #   total sulfur dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## #   alcohol <dbl>, quality <int>, color <chr>, x <int>
summary(eliminated)
##  fixed_acidity    volatile_acidity    citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :   0.080   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:   0.230   1st Qu.:0.2400   1st Qu.: 1.800  
##  Median : 7.000   Median :   0.290   Median :0.3100   Median : 2.900  
##  Mean   : 7.159   Mean   :   1.341   Mean   :0.3158   Mean   : 5.301  
##  3rd Qu.: 7.600   3rd Qu.:   0.400   3rd Qu.:0.3900   3rd Qu.: 8.000  
##  Max.   :15.900   Max.   :1185.000   Max.   :1.6600   Max.   :22.600  
##    chlorides       free sulfur dioxide total sulfur dioxide    density      
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 78.0        1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :0.9948  
##  Mean   :0.05562   Mean   : 30.51      Mean   :115.6        Mean   :0.9945  
##  3rd Qu.:0.06400   3rd Qu.: 41.00      3rd Qu.:155.0        3rd Qu.:0.9968  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :1.0010  
##        pH         sulphates         alcohol             quality     
##  Min.   :2.72   Min.   :0.2200   Min.   :8.000e+00   Min.   :3.000  
##  1st Qu.:3.11   1st Qu.:0.4300   1st Qu.:1.000e+01   1st Qu.:5.000  
##  Median :3.21   Median :0.5000   Median :1.000e+01   Median :6.000  
##  Mean   :3.22   Mean   :0.5289   Mean   :1.773e+12   Mean   :5.824  
##  3rd Qu.:3.32   3rd Qu.:0.6000   3rd Qu.:1.100e+01   3rd Qu.:6.000  
##  Max.   :4.01   Max.   :2.0000   Max.   :9.733e+14   Max.   :9.000  
##     color                 x       
##  Length:6351        Min.   :   1  
##  Class :character   1st Qu.:1662  
##  Mode  :character   Median :3274  
##                     Mean   :3275  
##                     3rd Qu.:4900  
##                     Max.   :6497
boxplot(eliminated$alcohol, plot = FALSE)$out
##  [1] 1.490000e+01 1.003333e+14 1.003333e+14 1.106667e+14 9.566667e+14
##  [6] 1.356667e+14 9.233333e+14 1.420000e+01 1.289333e+14 1.289333e+14
## [11] 1.146667e+14 1.003333e+14 1.143333e+14 1.053333e+14 9.533333e+14
## [16] 1.093333e+14 1.093333e+14 1.136667e+14 1.133333e+14 1.106667e+14
## [21] 1.133333e+14 9.733333e+14 9.733333e+14 1.133333e+14 1.405000e+01
## [26] 1.233333e+14 1.126667e+14 1.056667e+14 1.173333e+14 1.096667e+14
## [31] 1.096667e+14 1.096667e+14 1.013333e+14 1.013333e+14 1.046667e+14
## [36] 1.046667e+14 1.163333e+14 1.163333e+14 1.313333e+14 1.206667e+14
## [41] 9.633333e+14 9.533333e+14 9.533333e+14
# Alcohol values that boxplot() reports as outliers (no plot is drawn)
outliers1 <- boxplot(eliminated$alcohol, plot = FALSE)$out
df <- eliminated
# Filter with a logical mask rather than df[-which(cond), ]: when nothing is
# flagged, which() returns integer(0) and negative indexing with it selects
# zero rows, silently emptying the table.
df <- df[!(df$alcohol %in% outliers1), ]
summary(df)
##  fixed_acidity    volatile_acidity    citric_acid    residual_sugar  
##  Min.   : 3.800   Min.   :   0.080   Min.   :0.000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:   0.230   1st Qu.:0.240   1st Qu.: 1.800  
##  Median : 7.000   Median :   0.290   Median :0.310   Median : 2.900  
##  Mean   : 7.162   Mean   :   1.347   Mean   :0.316   Mean   : 5.303  
##  3rd Qu.: 7.600   3rd Qu.:   0.400   3rd Qu.:0.390   3rd Qu.: 8.000  
##  Max.   :14.200   Max.   :1185.000   Max.   :1.660   Max.   :22.600  
##    chlorides       free sulfur dioxide total sulfur dioxide    density      
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 78.0        1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :0.9948  
##  Mean   :0.05566   Mean   : 30.48      Mean   :115.6        Mean   :0.9946  
##  3rd Qu.:0.06400   3rd Qu.: 41.00      3rd Qu.:155.0        3rd Qu.:0.9968  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :1.0010  
##        pH         sulphates         alcohol         quality     
##  Min.   :2.72   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.11   1st Qu.:0.4300   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.21   Median :0.5000   Median :10.30   Median :6.000  
##  Mean   :3.22   Mean   :0.5287   Mean   :10.51   Mean   :5.823  
##  3rd Qu.:3.32   3rd Qu.:0.6000   3rd Qu.:11.30   3rd Qu.:6.000  
##  Max.   :4.01   Max.   :2.0000   Max.   :14.00   Max.   :9.000  
##     color                 x       
##  Length:6308        Min.   :   1  
##  Class :character   1st Qu.:1658  
##  Mode  :character   Median :3258  
##                     Mean   :3261  
##                     3rd Qu.:4874  
##                     Max.   :6497
hist(df$quality)

summary(df$alcohol)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00    9.50   10.30   10.51   11.30   14.00
which(is.na(df))
## integer(0)
sum(is.na(df))
## [1] 0
df <- na.omit(df)
summary(df)
##  fixed_acidity    volatile_acidity    citric_acid    residual_sugar  
##  Min.   : 3.800   Min.   :   0.080   Min.   :0.000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:   0.230   1st Qu.:0.240   1st Qu.: 1.800  
##  Median : 7.000   Median :   0.290   Median :0.310   Median : 2.900  
##  Mean   : 7.162   Mean   :   1.347   Mean   :0.316   Mean   : 5.303  
##  3rd Qu.: 7.600   3rd Qu.:   0.400   3rd Qu.:0.390   3rd Qu.: 8.000  
##  Max.   :14.200   Max.   :1185.000   Max.   :1.660   Max.   :22.600  
##    chlorides       free sulfur dioxide total sulfur dioxide    density      
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 78.0        1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :0.9948  
##  Mean   :0.05566   Mean   : 30.48      Mean   :115.6        Mean   :0.9946  
##  3rd Qu.:0.06400   3rd Qu.: 41.00      3rd Qu.:155.0        3rd Qu.:0.9968  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :1.0010  
##        pH         sulphates         alcohol         quality     
##  Min.   :2.72   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.11   1st Qu.:0.4300   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.21   Median :0.5000   Median :10.30   Median :6.000  
##  Mean   :3.22   Mean   :0.5287   Mean   :10.51   Mean   :5.823  
##  3rd Qu.:3.32   3rd Qu.:0.6000   3rd Qu.:11.30   3rd Qu.:6.000  
##  Max.   :4.01   Max.   :2.0000   Max.   :14.00   Max.   :9.000  
##     color                 x       
##  Length:6308        Min.   :   1  
##  Class :character   1st Qu.:1658  
##  Mode  :character   Median :3258  
##                     Mean   :3261  
##                     3rd Qu.:4874  
##                     Max.   :6497
which(is.na(df))
## integer(0)
sum(is.na(df))
## [1] 0
summary(df$alcohol)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00    9.50   10.30   10.51   11.30   14.00

Exploratory Data Analysis (EDA) and Data Pre-processing

summary(df)
##  fixed_acidity    volatile_acidity    citric_acid    residual_sugar  
##  Min.   : 3.800   Min.   :   0.080   Min.   :0.000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:   0.230   1st Qu.:0.240   1st Qu.: 1.800  
##  Median : 7.000   Median :   0.290   Median :0.310   Median : 2.900  
##  Mean   : 7.162   Mean   :   1.347   Mean   :0.316   Mean   : 5.303  
##  3rd Qu.: 7.600   3rd Qu.:   0.400   3rd Qu.:0.390   3rd Qu.: 8.000  
##  Max.   :14.200   Max.   :1185.000   Max.   :1.660   Max.   :22.600  
##    chlorides       free sulfur dioxide total sulfur dioxide    density      
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 78.0        1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :0.9948  
##  Mean   :0.05566   Mean   : 30.48      Mean   :115.6        Mean   :0.9946  
##  3rd Qu.:0.06400   3rd Qu.: 41.00      3rd Qu.:155.0        3rd Qu.:0.9968  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :1.0010  
##        pH         sulphates         alcohol         quality     
##  Min.   :2.72   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.11   1st Qu.:0.4300   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.21   Median :0.5000   Median :10.30   Median :6.000  
##  Mean   :3.22   Mean   :0.5287   Mean   :10.51   Mean   :5.823  
##  3rd Qu.:3.32   3rd Qu.:0.6000   3rd Qu.:11.30   3rd Qu.:6.000  
##  Max.   :4.01   Max.   :2.0000   Max.   :14.00   Max.   :9.000  
##     color                 x       
##  Length:6308        Min.   :   1  
##  Class :character   1st Qu.:1658  
##  Mode  :character   Median :3258  
##                     Mean   :3261  
##                     3rd Qu.:4874  
##                     Max.   :6497

Observations from the summary: Mean residual sugar level is 5.3 g/l. Mean free sulfur dioxide is 30.5 ppm; the maximum of 289 ppm is quite high, given that the 75th percentile is 41 ppm. The pH of the wines ranges from 2.7 to 4.0, with a mean of 3.2.

Alcohol: lightest wine is 8%.

Minimum quality mark is 3, mean 5.8, highest is 9.

# Correlation of each measurement with quality. Predictors are selected by
# name so that quality itself (column 12), the colour label and the row index
# are excluded; correlating quality with itself only adds a meaningless 1.0.
predictors <- setdiff(names(df), c("quality", "color", "x"))
cor(x = df[, predictors], y = df$quality)
##                             [,1]
## fixed_acidity        -0.07696280
## volatile_acidity     -0.05507775
## citric_acid           0.09126767
## residual_sugar       -0.03114074
## chlorides            -0.19962943
## free sulfur dioxide   0.05655009
## total sulfur dioxide -0.03805021
## density              -0.31937078
## pH                    0.01851999
## sulphates             0.03809322
## alcohol               0.44917292
#sulphates:0,2514
#citric acid : 0.2264
#total sulfur dioxide : -0.18
#alcolhol:0,0226
summary(df$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.823   6.000   9.000
table(df$ quality)
## 
##    3    4    5    6    7    8    9 
##   28  213 2059 2759 1053  191    5

install.packages("ggplot2")

library(ggplot2)

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
# Histogram of quality scores, filled by wine colour.
# Built with ggplot() + geom_histogram() because qplot() is deprecated.
# The visible x-range is set with coord_cartesian() instead of scale limits:
# scale limits discard bars that extend past them (the source of the former
# "Removed 4 rows containing missing values" warning), whereas
# coord_cartesian() only zooms the view.
ggplot(df, aes(x = quality, fill = color)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(3, 10, 1)) +
  coord_cartesian(xlim = c(2.5, 10.5))

summary(df$alcohol)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00    9.50   10.30   10.51   11.30   14.00
# Histogram of alcohol content in 0.5-wide bins, filled by wine colour.
# ggplot() replaces the deprecated qplot(); coord_cartesian() sets the visible
# range without discarding the bars at the edges, which scale limits did
# (the former "Removed 4 rows containing missing values" warning).
ggplot(df, aes(x = alcohol, fill = color)) +
  geom_histogram(binwidth = 0.5) +
  scale_x_continuous(breaks = seq(8, 15, 0.5)) +
  coord_cartesian(xlim = c(7.75, 15.25))

# Density curves of quality, one line per wine colour.
# ggplot() replaces the deprecated qplot(); the binwidth argument is dropped
# because the density geom does not accept it (it was ignored with a warning).
ggplot(df, aes(x = quality, colour = color)) +
  geom_density() +
  scale_x_continuous(breaks = seq(3, 9, 1))

install.packages("caTools")

library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
# Fix the random seed so the split below is reproducible
set.seed(150)
# Split on the quality column with a ratio of 0.8, using caTools' sample.split()
split <- sample.split(Y = df$quality, SplitRatio = 0.8)