# Packages used by this analysis: plotting (ggplot2, GGally) and Excel import
# (readxl). Each library() call sits on its own line; two calls on one line
# without a separator do not parse.
library(ggplot2)
library(GGally)
library(readxl)

# Folder holding the two source spreadsheets.
# NOTE(review): absolute, machine-specific location; change it here to run
# the script elsewhere.
data_dir <- "C:/Users/PC/Desktop/ads542-project"

# Red-wine measurements.
wine_red <- read_excel(file.path(data_dir, "wine-red.xlsx"))
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in G1297 / R1297C7: '77.5'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in G1298 / R1298C7: '77.5'

# White-wine measurements.
wine_white <- read_excel(file.path(data_dir, "wine-white.xlsx"))

# View() opens a data viewer, which is only meaningful in an interactive
# session, so it is skipped when the script runs unattended.
if (interactive()) {
  View(wine_red)
  View(wine_white)
}

# Working copies, so the imported tables stay untouched.
red <- wine_red
white <- wine_white
# Tag every row with its wine type so the two sets remain distinguishable
# once they are stacked.
red[["color"]] <- "red"
white[["color"]] <- "white"

# Stack the red rows on top of the white rows into one table.
data <- rbind(red, white)
# Preview the first rows of the merged table. The output below shows the
# measurement columns typed as <chr>, so they need converting before analysis.
head(data)
## # A tibble: 6 x 13
##   `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
##   <chr>           <chr>              <chr>         <chr>            <chr>    
## 1 7.4             0.7                0             1.9              0.076    
## 2 7.8             0.88               0             2.6              0.098    
## 3 7.8             0.76               0.04          2.3              0.092    
## 4 11.2            0.28               0.56          1.9              0.075    
## 5 7.4             0.7                0             1.9              0.076    
## 6 7.4             0.66               0             1.8              0.075    
## # ... with 8 more variables: free sulfur dioxide <chr>,
## #   total sulfur dioxide <chr>, density <chr>, pH <chr>, sulphates <chr>,
## #   alcohol <chr>, quality <dbl>, color <chr>
# Preview the last rows of the merged table (white was appended after red).
tail(data)
## # A tibble: 6 x 13
##   `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
##   <chr>           <chr>              <chr>         <chr>            <chr>    
## 1 6.5             0.23               0.38          1.3              0.032    
## 2 6.2             0.21               0.29          1.6              0.039    
## 3 6.6             0.32               0.36          8                0.047    
## 4 6.5             0.24               0.19          1.2              0.041    
## 5 5.5             0.29               0.3           1.1              0.022    
## 6 6               0.21               0.38          0.8              0.02     
## # ... with 8 more variables: free sulfur dioxide <chr>,
## #   total sulfur dioxide <chr>, density <chr>, pH <chr>, sulphates <chr>,
## #   alcohol <chr>, quality <dbl>, color <chr>
# Size of the merged table as (rows, columns).
dim(data)
## [1] 6497   13
# Column names as imported; the output below shows that several contain spaces.
names(data)
##  [1] "fixed acidity"        "volatile acidity"     "citric acid"         
##  [4] "residual sugar"       "chlorides"            "free sulfur dioxide" 
##  [7] "total sulfur dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"             
## [13] "color"
# Rename the columns. The acidity, citric-acid and sugar columns get
# underscores; the two sulfur-dioxide columns keep their spaces and are
# addressed with backticks further down.
names(data) <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
  "pH", "sulphates", "alcohol", "quality", "color"
)

# Per-column summary under the new names.
summary(data)
##  fixed_acidity      volatile_acidity   citric_acid        residual_sugar    
##  Length:6497        Length:6497        Length:6497        Length:6497       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   chlorides         free sulfur dioxide total sulfur dioxide   density         
##  Length:6497        Length:6497         Length:6497          Length:6497       
##  Class :character   Class :character    Class :character     Class :character  
##  Mode  :character   Mode  :character    Mode  :character     Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##       pH             sulphates           alcohol             quality     
##  Length:6497        Length:6497        Length:6497        Min.   :3.000  
##  Class :character   Class :character   Class :character   1st Qu.:5.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :6.000  
##                                                           Mean   :5.818  
##                                                           3rd Qu.:6.000  
##                                                           Max.   :9.000  
##     color          
##  Length:6497       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Inspect the column types. The output below shows every measurement column
# still stored as character, with only quality numeric.
str(data)
## tibble [6,497 x 13] (S3: tbl_df/tbl/data.frame)
##  $ fixed_acidity       : chr [1:6497] "7.4" "7.8" "7.8" "11.2" ...
##  $ volatile_acidity    : chr [1:6497] "0.7" "0.88" "0.76" "0.28" ...
##  $ citric_acid         : chr [1:6497] "0" "0" "0.04" "0.56" ...
##  $ residual_sugar      : chr [1:6497] "1.9" "2.6" "2.3" "1.9" ...
##  $ chlorides           : chr [1:6497] "0.076" "0.098" "0.092" "0.075" ...
##  $ free sulfur dioxide : chr [1:6497] "11" "25" "15" "17" ...
##  $ total sulfur dioxide: chr [1:6497] "34" "67" "54" "60" ...
##  $ density             : chr [1:6497] "0.9978" "0.9968" "0.997" "0.998" ...
##  $ pH                  : chr [1:6497] "3.51" "3.2" "3.26" "3.16" ...
##  $ sulphates           : chr [1:6497] "0.56" "0.68" "0.65" "0.58" ...
##  $ alcohol             : chr [1:6497] "9.4" "9.8" "9.8" "9.8" ...
##  $ quality             : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
##  $ color               : chr [1:6497] "red" "red" "red" "red" ...
# Add a 1-based row identifier. seq_len() is used rather than 1:nrow(data)
# because 1:0 would yield c(1, 0) if the table were ever empty.
data$x <- seq_len(nrow(data))

# Print the table to confirm the new column is present.
data
## # A tibble: 6,497 x 14
##    fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
##    <chr>         <chr>            <chr>       <chr>          <chr>    
##  1 7.4           0.7              0           1.9            0.076    
##  2 7.8           0.88             0           2.6            0.098    
##  3 7.8           0.76             0.04        2.3            0.092    
##  4 11.2          0.28             0.56        1.9            0.075    
##  5 7.4           0.7              0           1.9            0.076    
##  6 7.4           0.66             0           1.8            0.075    
##  7 7.9           0.6              0.06        1.6            0.069    
##  8 7.3           0.65             0           1.2            0.065    
##  9 7.8           0.58             0.02        2              0.073    
## 10 7.5           0.5              0.36        6.1            0.071    
## # ... with 6,487 more rows, and 9 more variables: free sulfur dioxide <chr>,
## #   total sulfur dioxide <chr>, density <chr>, pH <chr>, sulphates <chr>,
## #   alcohol <chr>, quality <dbl>, color <chr>, x <int>
# Re-inspect the structure now that the row-index column x has been added.
str(data)
## tibble [6,497 x 14] (S3: tbl_df/tbl/data.frame)
##  $ fixed_acidity       : chr [1:6497] "7.4" "7.8" "7.8" "11.2" ...
##  $ volatile_acidity    : chr [1:6497] "0.7" "0.88" "0.76" "0.28" ...
##  $ citric_acid         : chr [1:6497] "0" "0" "0.04" "0.56" ...
##  $ residual_sugar      : chr [1:6497] "1.9" "2.6" "2.3" "1.9" ...
##  $ chlorides           : chr [1:6497] "0.076" "0.098" "0.092" "0.075" ...
##  $ free sulfur dioxide : chr [1:6497] "11" "25" "15" "17" ...
##  $ total sulfur dioxide: chr [1:6497] "34" "67" "54" "60" ...
##  $ density             : chr [1:6497] "0.9978" "0.9968" "0.997" "0.998" ...
##  $ pH                  : chr [1:6497] "3.51" "3.2" "3.26" "3.16" ...
##  $ sulphates           : chr [1:6497] "0.56" "0.68" "0.65" "0.58" ...
##  $ alcohol             : chr [1:6497] "9.4" "9.8" "9.8" "9.8" ...
##  $ quality             : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
##  $ color               : chr [1:6497] "red" "red" "red" "red" ...
##  $ x                   : int [1:6497] 1 2 3 4 5 6 7 8 9 10 ...
# Convert the eleven measurement columns from character to numeric in one
# pass. They are already character vectors, so as.numeric() is applied
# directly; wrapping them in as.character() first adds nothing.
# NOTE(review): the summary printed after this step shows implausible maxima
# (e.g. alcohol 9.733e+14, density 103898, volatile_acidity 1185), so some
# source cells presumably hold malformed numbers. Validate the value ranges
# before modelling.
measurement_cols <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
  "pH", "sulphates", "alcohol"
)
data[measurement_cols] <- lapply(data[measurement_cols], as.numeric)

# quality is already numeric; store the scores as integers.
data$quality <- as.integer(data$quality)

Exploratory Data Analysis (EDA) and Pre-Processing

# Summary statistics after the numeric conversion.
summary(data)
##  fixed_acidity    volatile_acidity    citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :   0.080   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:   0.230   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :   0.290   Median :0.3100   Median : 3.000  
##  Mean   : 7.215   Mean   :   1.319   Mean   :0.3186   Mean   : 5.443  
##  3rd Qu.: 7.700   3rd Qu.:   0.400   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1185.000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free sulfur dioxide total sulfur dioxide    density         
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :     0.99  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 77.0        1st Qu.:     0.99  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :     0.99  
##  Mean   :0.05603   Mean   : 30.53      Mean   :115.7        Mean   :   739.81  
##  3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:156.0        3rd Qu.:     1.00  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :103898.00  
##        pH          sulphates         alcohol             quality     
##  Min.   :2.720   Min.   :0.2200   Min.   :8.000e+00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.:1.000e+01   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :1.000e+01   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :1.733e+12   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:1.100e+01   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :9.733e+14   Max.   :9.000  
##     color                 x       
##  Length:6497        Min.   :   1  
##  Class :character   1st Qu.:1625  
##  Mode  :character   Median :3249  
##                     Mean   :3249  
##                     3rd Qu.:4873  
##                     Max.   :6497
# Histogram of the quality scores.
hist(data$quality)

# Summary of alcohol on its own; the output below shows a mean and maximum far
# above the quartiles.
summary(data$alcohol)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 8.000e+00 1.000e+01 1.000e+01 1.733e+12 1.100e+01 9.733e+14
# Positions of missing values in data; the output below is empty.
which(is.na(data))
## integer(0)
# Total number of missing values in data.
sum(is.na(data))
## [1] 0
# Keep only rows without any NA. The count above is 0, and the summary below
# still reports 6497 rows.
df <- na.omit(data)
summary(df)
##  fixed_acidity    volatile_acidity    citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :   0.080   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:   0.230   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :   0.290   Median :0.3100   Median : 3.000  
##  Mean   : 7.215   Mean   :   1.319   Mean   :0.3186   Mean   : 5.443  
##  3rd Qu.: 7.700   3rd Qu.:   0.400   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1185.000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free sulfur dioxide total sulfur dioxide    density         
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :     0.99  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 77.0        1st Qu.:     0.99  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :     0.99  
##  Mean   :0.05603   Mean   : 30.53      Mean   :115.7        Mean   :   739.81  
##  3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:156.0        3rd Qu.:     1.00  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :103898.00  
##        pH          sulphates         alcohol             quality     
##  Min.   :2.720   Min.   :0.2200   Min.   :8.000e+00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.:1.000e+01   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :1.000e+01   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :1.733e+12   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:1.100e+01   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :9.733e+14   Max.   :9.000  
##     color                 x       
##  Length:6497        Min.   :   1  
##  Class :character   1st Qu.:1625  
##  Mode  :character   Median :3249  
##                     Mean   :3249  
##                     3rd Qu.:4873  
##                     Max.   :6497
# Re-check for missing values on the filtered table df. The original call
# inspected data again, which says nothing about the result of na.omit().
which(is.na(df))
## integer(0)
sum(is.na(df))
## [1] 0
# Alcohol summary on the filtered table.
summary(df$alcohol)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 8.000e+00 1.000e+01 1.000e+01 1.733e+12 1.100e+01 9.733e+14

Exploratory Data Analysis (EDA) and Data Pre-processing

# Summary statistics of the NA-filtered table df.
summary(df)
##  fixed_acidity    volatile_acidity    citric_acid     residual_sugar  
##  Min.   : 3.800   Min.   :   0.080   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:   0.230   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :   0.290   Median :0.3100   Median : 3.000  
##  Mean   : 7.215   Mean   :   1.319   Mean   :0.3186   Mean   : 5.443  
##  3rd Qu.: 7.700   3rd Qu.:   0.400   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1185.000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free sulfur dioxide total sulfur dioxide    density         
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :     0.99  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 77.0        1st Qu.:     0.99  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :     0.99  
##  Mean   :0.05603   Mean   : 30.53      Mean   :115.7        Mean   :   739.81  
##  3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:156.0        3rd Qu.:     1.00  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :103898.00  
##        pH          sulphates         alcohol             quality     
##  Min.   :2.720   Min.   :0.2200   Min.   :8.000e+00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.:1.000e+01   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :1.000e+01   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :1.733e+12   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:1.100e+01   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :9.733e+14   Max.   :9.000  
##     color                 x       
##  Length:6497        Min.   :   1  
##  Class :character   1st Qu.:1625  
##  Mode  :character   Median :3249  
##                     Mean   :3249  
##                     3rd Qu.:4873  
##                     Max.   :6497

Observations from the summary: the mean residual sugar level is 5.4 g/L. The mean free sulfur dioxide is 30.5 ppm; the maximum is 289 ppm, which is quite high given that the 75th percentile is 41 ppm. The pH of the wines ranges from 2.7 to 4.0, with a mean of 3.2.

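Alcohol: the lightest wine is 8%. The reported mean (1.733e+12) and maximum (9.733e+14) are not plausible percentages, which indicates malformed alcohol values that must be cleaned before these statistics can be interpreted.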

The minimum quality score is 3, the mean is 5.8, and the maximum is 9.

# Pearson correlation of each measurement column with quality.
# Only columns 1:11 are passed as x. Column 12 is quality itself, and its
# correlation with itself is trivially 1.
# NOTE(review): the summaries above show extreme values in several columns
# (e.g. alcohol, density), which presumably distort these coefficients.
# Re-run after cleaning.
cor(x = data[, 1:11], y = data$quality)
##                              [,1]
## fixed_acidity        -0.076743208
## volatile_acidity     -0.054250827
## citric_acid           0.085531717
## residual_sugar       -0.036980485
## chlorides            -0.200665500
## free sulfur dioxide   0.055463059
## total sulfur dioxide -0.041385454
## density              -0.029857295
## pH                    0.019505704
## sulphates             0.038485446
## alcohol              -0.006183752
# Figures noted by the author. They do not match the output above, and their
# source is not shown here. TODO confirm where they come from.
# sulphates: 0.2514
# citric acid: 0.2264
# total sulfur dioxide: -0.18
# alcohol: 0.0226
# Minimum, quartiles, mean and maximum of the quality score.
summary(data[["quality"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.818   6.000   9.000
# Number of wines at each quality score.
table(data[["quality"]])
## 
##    3    4    5    6    7    8    9 
##   30  216 2138 2836 1079  193    5

# One-time setup. Run this interactively instead of on every execution:
# install.packages("ggplot2")

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3

# Histogram of quality scores, stacked by wine colour.
# ggplot() replaces qplot(), which is deprecated in current ggplot2 releases.
# The hard x limits c(3, 10) are dropped: they apparently clipped the
# outermost bins, which is what the original "Removed 4 rows containing
# missing values (geom_bar)" warning reported. The scale now fits the bars.
ggplot(data, aes(x = quality, fill = color)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(3, 10, 1))