# Attach the plotting and Excel-import packages. Each call is on its own line:
# two statements on one line without a `;` separator do not parse.
library(ggplot2)
library(GGally)
library(readxl)

# Read the red-wine workbook into a tibble.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only run where this exact file location exists.
wine_red <- read_excel("C:/Users/PC/Desktop/ads542-project/wine-red.xlsx")
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in G1297 / R1297C7: '77.5'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in G1298 / R1298C7: '77.5'
# Open the red-wine table in the data viewer only in an interactive session;
# View() is a GUI viewer and has no use when the script is run
# non-interactively (Rscript, knitting).
if (interactive()) {
  View(wine_red)
}

# Read the white-wine workbook (readxl is already attached at the top of the
# file, so it is not attached a second time here).
wine_white <- read_excel("C:/Users/PC/Desktop/ads542-project/wine-white.xlsx")
if (interactive()) {
  View(wine_white)
}

# Working copies, so the imported tables stay untouched by later edits.
red <- wine_red
white <- wine_white
# Add a categorical variable to both sets so each row records its wine type
# once the two tables are stacked.
red$color <- "red"
white$color <- "white"

# Stack the red rows on top of the white rows into one table, then preview
# the first rows of the result.
data <- rbind(red, white)
head(data)
## # A tibble: 6 x 13
## `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
## <chr> <chr> <chr> <chr> <chr>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # ... with 8 more variables: free sulfur dioxide <chr>,
## # total sulfur dioxide <chr>, density <chr>, pH <chr>, sulphates <chr>,
## # alcohol <chr>, quality <dbl>, color <chr>
# Preview the last rows of the merged table.
tail(data)
## # A tibble: 6 x 13
## `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
## <chr> <chr> <chr> <chr> <chr>
## 1 6.5 0.23 0.38 1.3 0.032
## 2 6.2 0.21 0.29 1.6 0.039
## 3 6.6 0.32 0.36 8 0.047
## 4 6.5 0.24 0.19 1.2 0.041
## 5 5.5 0.29 0.3 1.1 0.022
## 6 6 0.21 0.38 0.8 0.02
## # ... with 8 more variables: free sulfur dioxide <chr>,
## # total sulfur dioxide <chr>, density <chr>, pH <chr>, sulphates <chr>,
## # alcohol <chr>, quality <dbl>, color <chr>
# Number of rows and columns in the merged table.
dim(data)
## [1] 6497 13
# Column names as imported (they still contain spaces at this point).
names(data)
## [1] "fixed acidity" "volatile acidity" "citric acid"
## [4] "residual sugar" "chlorides" "free sulfur dioxide"
## [7] "total sulfur dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
## [13] "color"
# Assign new headers to all 13 columns, in their existing order.
# NOTE(review): "free sulfur dioxide" and "total sulfur dioxide" keep their
# spaces, and the type-conversion code further down refers to them with
# backticks, so they can only be renamed together with that code.
names(data) <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
  "pH", "sulphates", "alcohol", "quality", "color"
)

# Per-column summary of the renamed table.
summary(data)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Length:6497 Length:6497 Length:6497 Length:6497
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## chlorides free sulfur dioxide total sulfur dioxide density
## Length:6497 Length:6497 Length:6497 Length:6497
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## pH sulphates alcohol quality
## Length:6497 Length:6497 Length:6497 Min. :3.000
## Class :character Class :character Class :character 1st Qu.:5.000
## Mode :character Mode :character Mode :character Median :6.000
## Mean :5.818
## 3rd Qu.:6.000
## Max. :9.000
## color
## Length:6497
## Class :character
## Mode :character
##
##
##
# Show the storage type of every column.
str(data)
## tibble [6,497 x 13] (S3: tbl_df/tbl/data.frame)
## $ fixed_acidity : chr [1:6497] "7.4" "7.8" "7.8" "11.2" ...
## $ volatile_acidity : chr [1:6497] "0.7" "0.88" "0.76" "0.28" ...
## $ citric_acid : chr [1:6497] "0" "0" "0.04" "0.56" ...
## $ residual_sugar : chr [1:6497] "1.9" "2.6" "2.3" "1.9" ...
## $ chlorides : chr [1:6497] "0.076" "0.098" "0.092" "0.075" ...
## $ free sulfur dioxide : chr [1:6497] "11" "25" "15" "17" ...
## $ total sulfur dioxide: chr [1:6497] "34" "67" "54" "60" ...
## $ density : chr [1:6497] "0.9978" "0.9968" "0.997" "0.998" ...
## $ pH : chr [1:6497] "3.51" "3.2" "3.26" "3.16" ...
## $ sulphates : chr [1:6497] "0.56" "0.68" "0.65" "0.58" ...
## $ alcohol : chr [1:6497] "9.4" "9.8" "9.8" "9.8" ...
## $ quality : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
## $ color : chr [1:6497] "red" "red" "red" "red" ...
# Add a 1-based row index column `x`. seq_len() yields an empty sequence for a
# zero-row table, whereas 1:nrow(data) would yield c(1, 0).
data$x <- seq_len(nrow(data))
# Print the table to confirm the new column is present.
data
## # A tibble: 6,497 x 14
## fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <chr> <chr> <chr> <chr> <chr>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## 7 7.9 0.6 0.06 1.6 0.069
## 8 7.3 0.65 0 1.2 0.065
## 9 7.8 0.58 0.02 2 0.073
## 10 7.5 0.5 0.36 6.1 0.071
## # ... with 6,487 more rows, and 9 more variables: free sulfur dioxide <chr>,
## # total sulfur dioxide <chr>, density <chr>, pH <chr>, sulphates <chr>,
## # alcohol <chr>, quality <dbl>, color <chr>, x <int>
# Re-check the column types now that the index column `x` has been added.
str(data)
## tibble [6,497 x 14] (S3: tbl_df/tbl/data.frame)
## $ fixed_acidity : chr [1:6497] "7.4" "7.8" "7.8" "11.2" ...
## $ volatile_acidity : chr [1:6497] "0.7" "0.88" "0.76" "0.28" ...
## $ citric_acid : chr [1:6497] "0" "0" "0.04" "0.56" ...
## $ residual_sugar : chr [1:6497] "1.9" "2.6" "2.3" "1.9" ...
## $ chlorides : chr [1:6497] "0.076" "0.098" "0.092" "0.075" ...
## $ free sulfur dioxide : chr [1:6497] "11" "25" "15" "17" ...
## $ total sulfur dioxide: chr [1:6497] "34" "67" "54" "60" ...
## $ density : chr [1:6497] "0.9978" "0.9968" "0.997" "0.998" ...
## $ pH : chr [1:6497] "3.51" "3.2" "3.26" "3.16" ...
## $ sulphates : chr [1:6497] "0.56" "0.68" "0.65" "0.58" ...
## $ alcohol : chr [1:6497] "9.4" "9.8" "9.8" "9.8" ...
## $ quality : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
## $ color : chr [1:6497] "red" "red" "red" "red" ...
## $ x : int [1:6497] 1 2 3 4 5 6 7 8 9 10 ...
# Convert the eleven measurement columns to numeric and the quality score to
# integer. The conversion is applied through one column list instead of eleven
# copy-pasted statements; as.character() is kept so the result is unchanged
# for any column type the original statements handled.
measurement_cols <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
  "pH", "sulphates", "alcohol"
)
data[measurement_cols] <- lapply(
  data[measurement_cols],
  function(column) as.numeric(as.character(column))
)
data$quality <- as.integer(as.character(data$quality))

# NOTE(review): the summary recorded below shows implausible maxima
# (volatile_acidity 1185, density 103898, alcohol 9.733e+14), which suggests
# some cells were mis-parsed on import -- confirm against the spreadsheets
# before relying on the means or correlations.
summary(data)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. : 0.080 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.: 0.230 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median : 0.290 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean : 1.319 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.: 0.400 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1185.000 Max. :1.6600 Max. :65.800
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. : 0.99
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.: 0.99
## Median :0.04700 Median : 29.00 Median :118.0 Median : 0.99
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean : 739.81
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.: 1.00
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :103898.00
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. :8.000e+00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.:1.000e+01 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :1.000e+01 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :1.733e+12 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:1.100e+01 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :9.733e+14 Max. :9.000
## color x
## Length:6497 Min. : 1
## Class :character 1st Qu.:1625
## Mode :character Median :3249
## Mean :3249
## 3rd Qu.:4873
## Max. :6497
# Histogram of the quality scores (base graphics).
hist(data$quality)
# Distribution summary of the alcohol column on its own.
summary(data$alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.000e+00 1.000e+01 1.000e+01 1.733e+12 1.100e+01 9.733e+14
# Positions of missing values in the table (none are reported below).
which(is.na(data))
## integer(0)
# Total count of missing values.
sum(is.na(data))
## [1] 0
# Keep only the rows without missing values, then summarise the result.
df <- na.omit(data)
summary(df)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. : 0.080 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.: 0.230 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median : 0.290 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean : 1.319 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.: 0.400 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1185.000 Max. :1.6600 Max. :65.800
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. : 0.99
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.: 0.99
## Median :0.04700 Median : 29.00 Median :118.0 Median : 0.99
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean : 739.81
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.: 1.00
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :103898.00
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. :8.000e+00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.:1.000e+01 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :1.000e+01 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :1.733e+12 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:1.100e+01 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :9.733e+14 Max. :9.000
## color x
## Length:6497 Min. : 1
## Class :character 1st Qu.:1625
## Mode :character Median :3249
## Mean :3249
## 3rd Qu.:4873
## Max. :6497
# Verify the cleaned table `df` (not the original `data`) has no missing
# values left, consistent with the count check that follows.
which(is.na(df))
## integer(0)
sum(is.na(df))
## [1] 0
# Distribution summary of alcohol in the cleaned table.
summary(df$alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.000e+00 1.000e+01 1.000e+01 1.733e+12 1.100e+01 9.733e+14
# Per-column summary of the cleaned table.
summary(df)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. : 0.080 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.: 0.230 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median : 0.290 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean : 1.319 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.: 0.400 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1185.000 Max. :1.6600 Max. :65.800
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. : 0.99
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.: 0.99
## Median :0.04700 Median : 29.00 Median :118.0 Median : 0.99
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean : 739.81
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.: 1.00
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :103898.00
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. :8.000e+00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.:1.000e+01 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :1.000e+01 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :1.733e+12 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:1.100e+01 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :9.733e+14 Max. :9.000
## color x
## Length:6497 Min. : 1
## Class :character 1st Qu.:1625
## Mode :character Median :3249
## Mean :3249
## 3rd Qu.:4873
## Max. :6497
Observations from the summary: The mean residual sugar level is 5.4 g/L. The mean free sulfur dioxide is 30.5 ppm; its maximum of 289 ppm is quite high, given that the 75th percentile is only 41 ppm. The pH of the wines ranges from 2.7 to 4.0, with a mean of 3.2.
Alcohol: the lightest wine is 8%.
The minimum quality mark is 3, the mean is 5.8, and the highest is 9.
# Correlate each of the first 12 columns (the 11 measurements plus quality
# itself) with quality, using cor()'s default method. Because quality is among
# the 12 columns, its own row is 1 by construction.
cor(data[, 1:12], data$quality)
## [,1]
## fixed_acidity -0.076743208
## volatile_acidity -0.054250827
## citric_acid 0.085531717
## residual_sugar -0.036980485
## chlorides -0.200665500
## free sulfur dioxide 0.055463059
## total sulfur dioxide -0.041385454
## density -0.029857295
## pH 0.019505704
## sulphates 0.038485446
## alcohol -0.006183752
## quality 1.000000000
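# NOTE(review): hand-recorded correlations; they do not match the cor() output
# above -- confirm which data they were computed from.
# sulphates: 0.2514
# citric acid: 0.2264
# total sulfur dioxide: -0.18
# alcohol: 0.0226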
# Distribution summary of the quality score.
summary(data$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.818 6.000 9.000
# Number of wines at each quality score.
table(data$quality)
##
## 3 4 5 6 7 8 9
## 30 216 2138 2836 1079 193 5
# Install ggplot2 only when it is missing, instead of reinstalling on every
# run, then attach it once. The package name uses straight quotes:
# typographic (curly) quotes are not valid R string delimiters.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
# Stacked histogram of quality scores, filled by wine colour, with one bin per
# integer score.
# - ggplot() + geom_histogram() replaces the deprecated qplot() shortcut.
# - The x-range is set with coord_cartesian(), which zooms the view without
#   discarding data. Scale limits of c(3, 10) removed the bars whose edges
#   extend past the limits (the "Removed 4 rows" warning), so the bar for
#   quality 3 was missing from the plot.
ggplot(data, aes(x = quality, fill = color)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(3, 10, 1)) +
  coord_cartesian(xlim = c(2.5, 10.5))
## Warning: Removed 4 rows containing missing values (geom_bar).