# ggplot2 is used by the plots further down; GGally is loaded alongside it.
# Each library() call needs its own line: two calls on one line without a
# separator do not parse.
library(ggplot2)
library(GGally)
# readxl provides read_excel() for the two .xlsx inputs.
library(readxl)

# Read the red- and white-wine spreadsheets.
# NOTE(review): these are absolute, machine-specific paths, so the script only
# runs where these exact files exist.
wine_red <- read_excel("C:/Users/PC/Desktop/ads542-project/wine-red2.xlsx")
wine_white <- read_excel("C:/Users/PC/Desktop/ads542-project/wine-white2.xlsx")

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(wine_red)
  View(wine_white)
}

# Work on separate names so the imported tables stay as read.
red <- wine_red
white <- wine_white
# Tag every row with its wine type so the origin survives the merge.
red$color <- "red"
white$color <- "white"
# Stack the red rows on top of the white rows in a single table.
data <- rbind(red, white)
# Preview the first rows of the merged table.
head(data)
## # A tibble: 6 x 13
## `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # ... with 8 more variables: free sulfur dioxide <dbl>,
## # total sulfur dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>, color <chr>
tail(data)
## # A tibble: 6 x 13
## `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.5 0.23 0.38 1.3 0.032
## 2 6.2 0.21 0.29 1.6 0.039
## 3 6.6 0.32 0.36 8 0.047
## 4 6.5 0.24 0.19 1.2 0.041
## 5 5.5 0.29 0.3 1.1 0.022
## 6 6 0.21 0.38 0.8 0.02
## # ... with 8 more variables: free sulfur dioxide <dbl>,
## # total sulfur dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>, color <chr>
dim(data)
## [1] 6497 13
names(data)
## [1] "fixed acidity" "volatile acidity" "citric acid"
## [4] "residual sugar" "chlorides" "free sulfur dioxide"
## [7] "total sulfur dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
## [13] "color"
# Rename the columns. Most names become snake_case; the two sulfur-dioxide
# columns keep their spaces, and later statements refer to them with backticks.
names(data) <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
  "pH", "sulphates", "alcohol", "quality", "color"
)
# Per-column summary statistics of the merged table.
summary(data)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. : 0.080 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.: 0.230 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median : 0.290 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean : 1.319 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.: 0.400 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1185.000 Max. :1.6600 Max. :65.800
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. : 0.99
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.: 0.99
## Median :0.04700 Median : 29.00 Median :118.0 Median : 0.99
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean : 739.66
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.: 1.00
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :103898.00
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. :8.000e+00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.:1.000e+01 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :1.000e+01 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :1.733e+12 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:1.100e+01 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :9.733e+14 Max. :9.000
## color
## Length:6497
## Class :character
## Mode :character
##
##
##
str(data)
## tibble [6,497 x 13] (S3: tbl_df/tbl/data.frame)
## $ fixed_acidity : num [1:6497] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile_acidity : num [1:6497] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric_acid : num [1:6497] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual_sugar : num [1:6497] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num [1:6497] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free sulfur dioxide : num [1:6497] 11 25 15 17 11 13 15 15 9 17 ...
## $ total sulfur dioxide: num [1:6497] 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num [1:6497] 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num [1:6497] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num [1:6497] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num [1:6497] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
## $ color : chr [1:6497] "red" "red" "red" "red" ...
# Add a running row identifier. seq_len() stays correct for an empty table,
# where 1:nrow(data) would yield c(1, 0).
data$x <- seq_len(nrow(data))
# Print the table to confirm the new column.
data
## # A tibble: 6,497 x 14
## fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## 7 7.9 0.6 0.06 1.6 0.069
## 8 7.3 0.65 0 1.2 0.065
## 9 7.8 0.58 0.02 2 0.073
## 10 7.5 0.5 0.36 6.1 0.071
## # ... with 6,487 more rows, and 9 more variables: free sulfur dioxide <dbl>,
## # total sulfur dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>, color <chr>, x <int>
str(data)
## tibble [6,497 x 14] (S3: tbl_df/tbl/data.frame)
## $ fixed_acidity : num [1:6497] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile_acidity : num [1:6497] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric_acid : num [1:6497] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual_sugar : num [1:6497] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num [1:6497] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free sulfur dioxide : num [1:6497] 11 25 15 17 11 13 15 15 9 17 ...
## $ total sulfur dioxide: num [1:6497] 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num [1:6497] 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num [1:6497] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num [1:6497] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num [1:6497] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
## $ color : chr [1:6497] "red" "red" "red" "red" ...
## $ x : int [1:6497] 1 2 3 4 5 6 7 8 9 10 ...
# Coerce the eleven measurement columns to double in one pass. The detour
# through character is the conversion that is also safe for factor input.
measure_cols <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
  "pH", "sulphates", "alcohol"
)
data[measure_cols] <- lapply(
  data[measure_cols],
  function(column) as.numeric(as.character(column))
)
# Quality is stored as an integer column rather than a double.
data$quality <- as.integer(as.character(data$quality))
# Re-check the per-column statistics after the conversion.
summary(data)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. : 0.080 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.: 0.230 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median : 0.290 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean : 1.319 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.: 0.400 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1185.000 Max. :1.6600 Max. :65.800
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. : 0.99
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.: 0.99
## Median :0.04700 Median : 29.00 Median :118.0 Median : 0.99
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean : 739.66
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.: 1.00
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :103898.00
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. :8.000e+00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.:1.000e+01 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :1.000e+01 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :1.733e+12 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:1.100e+01 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :9.733e+14 Max. :9.000
## color x
## Length:6497 Min. : 1
## Class :character 1st Qu.:1625
## Mode :character Median :3249
## Mean :3249
## 3rd Qu.:4873
## Max. :6497
# Print the density values that boxplot.stats() reports as outliers.
boxplot.stats(data$density)$out
## [1] 100005 100005 100025 100015 100015 1001 10014 10001 10015 10015
## [11] 10002 10008 10006 10004 10018 10001 1001 10022 10022 10014
## [21] 10003 10014 10014 10004 10014 10004 10004 10032 10008 10006
## [31] 10026 10002 10002 10002 1001 1001 10004 10004 10014 10008
## [41] 100315 100315 10002 100315 10002 10021 10021 10003 10002 10002
## [51] 10002 10004 10006 10006 10026 10006 10006 1001 1001 10004
## [61] 10004 10001 10002 100024 10001 100012 100289 100369 100369 100242
## [71] 100242 1001 10002 1001 1001 10002 100055 10006 10006 1001
## [81] 10002 10002 10004 10006 10003 10003 10003 10004 10001 10005
## [91] 10012 10004 10004 10024 10001 10103 10103 10004 10008 10002
## [101] 10008 10008 10007 10001 10001 10017 10017 10011 10011 10006
## [111] 10004 10004 10004 10002 10004 10001 10001 10001 10005 10001
## [121] 10001 10001 10001 100182 100047 100241 100098 100016 100051 100118
## [131] 100014 10002 100013 100013 103898 100014 100196 100037 100037 100295
## [141] 100295 100044 100044 100022 100038 100038
# Row positions of the flagged density values: %in% marks the matching rows
# and which() turns that logical mask into row numbers.
out <- boxplot.stats(data$density)$out
out_ind <- which(data$density %in% out)
out_ind
## [1] 244 245 267 285 286 290 295 296 325 326 329 339 345 351 354
## [16] 357 360 365 367 375 377 382 392 396 416 434 436 443 460 466
## [31] 481 489 494 500 516 517 532 533 539 545 555 556 557 558 559
## [46] 560 565 571 581 582 585 594 602 604 609 612 619 634 652 656
## [61] 681 699 738 744 745 812 890 1435 1436 1475 1477 1607 1614 1782 1791
## [76] 2317 2362 2379 2383 2643 2665 2671 2678 2690 2867 2870 2876 2932 3017 3036
## [91] 3054 3087 3180 3208 3234 3253 3263 3264 3280 3393 3407 3409 3448 3540 3542
## [106] 3558 3563 3573 3574 3585 3595 3597 3598 3599 3606 3651 3652 3656 3706 3710
## [121] 3711 3712 3715 3850 3869 3934 3978 3996 4011 4019 4034 4220 4234 4237 4381
## [136] 4862 5020 5145 5147 5219 5223 5278 5294 5330 5614 5618
# Keep only rows whose density lies strictly inside the 1.5 * IQR fences.
# NOTE(review): the flagged values printed earlier (e.g. 10014, 100315) look
# like densities that lost their decimal point on import. TODO confirm whether
# they should be rescaled instead of discarded.
Q <- quantile(data$density, probs = c(0.25, 0.75), na.rm = FALSE)
iqr <- IQR(data$density)
up <- Q[2] + 1.5 * iqr # Upper fence
low <- Q[1] - 1.5 * iqr # Lower fence
# Reuse the fences computed above instead of re-deriving them inline.
eliminated <- subset(data, data$density > low & data$density < up)
eliminated
## # A tibble: 6,351 x 14
## fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## 7 7.9 0.6 0.06 1.6 0.069
## 8 7.3 0.65 0 1.2 0.065
## 9 7.8 0.58 0.02 2 0.073
## 10 7.5 0.5 0.36 6.1 0.071
## # ... with 6,341 more rows, and 9 more variables: free sulfur dioxide <dbl>,
## # total sulfur dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <int>, color <chr>, x <int>
summary(eliminated)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. : 0.080 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.: 0.230 1st Qu.:0.2400 1st Qu.: 1.800
## Median : 7.000 Median : 0.290 Median :0.3100 Median : 2.900
## Mean : 7.159 Mean : 1.341 Mean :0.3158 Mean : 5.301
## 3rd Qu.: 7.600 3rd Qu.: 0.400 3rd Qu.:0.3900 3rd Qu.: 8.000
## Max. :15.900 Max. :1185.000 Max. :1.6600 Max. :22.600
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 78.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9948
## Mean :0.05562 Mean : 30.51 Mean :115.6 Mean :0.9945
## 3rd Qu.:0.06400 3rd Qu.: 41.00 3rd Qu.:155.0 3rd Qu.:0.9968
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0010
## pH sulphates alcohol quality
## Min. :2.72 Min. :0.2200 Min. :8.000e+00 Min. :3.000
## 1st Qu.:3.11 1st Qu.:0.4300 1st Qu.:1.000e+01 1st Qu.:5.000
## Median :3.21 Median :0.5000 Median :1.000e+01 Median :6.000
## Mean :3.22 Mean :0.5289 Mean :1.773e+12 Mean :5.824
## 3rd Qu.:3.32 3rd Qu.:0.6000 3rd Qu.:1.100e+01 3rd Qu.:6.000
## Max. :4.01 Max. :2.0000 Max. :9.733e+14 Max. :9.000
## color x
## Length:6351 Min. : 1
## Class :character 1st Qu.:1662
## Mode :character Median :3274
## Mean :3275
## 3rd Qu.:4900
## Max. :6497
# Print the alcohol values flagged as outliers; plot = FALSE returns the
# boxplot statistics without drawing the plot.
boxplot(eliminated$alcohol, plot = FALSE)$out
## [1] 1.490000e+01 1.003333e+14 1.003333e+14 1.106667e+14 9.566667e+14
## [6] 1.356667e+14 9.233333e+14 1.420000e+01 1.289333e+14 1.289333e+14
## [11] 1.146667e+14 1.003333e+14 1.143333e+14 1.053333e+14 9.533333e+14
## [16] 1.093333e+14 1.093333e+14 1.136667e+14 1.133333e+14 1.106667e+14
## [21] 1.133333e+14 9.733333e+14 9.733333e+14 1.133333e+14 1.405000e+01
## [26] 1.233333e+14 1.126667e+14 1.056667e+14 1.173333e+14 1.096667e+14
## [31] 1.096667e+14 1.096667e+14 1.013333e+14 1.013333e+14 1.046667e+14
## [36] 1.046667e+14 1.163333e+14 1.163333e+14 1.313333e+14 1.206667e+14
## [41] 9.633333e+14 9.533333e+14 9.533333e+14
# Alcohol values that boxplot() flags as outliers (computed, not drawn).
# NOTE(review): most flagged values are around 1e+14 and look like import
# artefacts, but plausible readings such as 14.9, 14.2 and 14.05 are flagged
# and removed as well. TODO confirm that this is intended.
outliers1 <- boxplot(eliminated$alcohol, plot = FALSE)$out
df <- eliminated
# Filter with a logical mask. df[-which(...), ] would return zero rows when
# nothing matches, because -integer(0) selects no rows at all.
df <- df[!(df$alcohol %in% outliers1), ]
summary(df)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. : 0.080 Min. :0.000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.: 0.230 1st Qu.:0.240 1st Qu.: 1.800
## Median : 7.000 Median : 0.290 Median :0.310 Median : 2.900
## Mean : 7.162 Mean : 1.347 Mean :0.316 Mean : 5.303
## 3rd Qu.: 7.600 3rd Qu.: 0.400 3rd Qu.:0.390 3rd Qu.: 8.000
## Max. :14.200 Max. :1185.000 Max. :1.660 Max. :22.600
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 78.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9948
## Mean :0.05566 Mean : 30.48 Mean :115.6 Mean :0.9946
## 3rd Qu.:0.06400 3rd Qu.: 41.00 3rd Qu.:155.0 3rd Qu.:0.9968
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0010
## pH sulphates alcohol quality
## Min. :2.72 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.11 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.21 Median :0.5000 Median :10.30 Median :6.000
## Mean :3.22 Mean :0.5287 Mean :10.51 Mean :5.823
## 3rd Qu.:3.32 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.01 Max. :2.0000 Max. :14.00 Max. :9.000
## color x
## Length:6308 Min. : 1
## Class :character 1st Qu.:1658
## Mode :character Median :3258
## Mean :3261
## 3rd Qu.:4874
## Max. :6497
hist(df$quality)
summary(df$alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 9.50 10.30 10.51 11.30 14.00
which(is.na(df))
## integer(0)
sum(is.na(df))
## [1] 0
# Drop rows that contain NA. sum(is.na(df)) printed 0 just above, so this is a
# safeguard that should leave df unchanged.
df <- na.omit(df)
# Summary statistics after the NA check.
summary(df)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. : 0.080 Min. :0.000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.: 0.230 1st Qu.:0.240 1st Qu.: 1.800
## Median : 7.000 Median : 0.290 Median :0.310 Median : 2.900
## Mean : 7.162 Mean : 1.347 Mean :0.316 Mean : 5.303
## 3rd Qu.: 7.600 3rd Qu.: 0.400 3rd Qu.:0.390 3rd Qu.: 8.000
## Max. :14.200 Max. :1185.000 Max. :1.660 Max. :22.600
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 78.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9948
## Mean :0.05566 Mean : 30.48 Mean :115.6 Mean :0.9946
## 3rd Qu.:0.06400 3rd Qu.: 41.00 3rd Qu.:155.0 3rd Qu.:0.9968
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0010
## pH sulphates alcohol quality
## Min. :2.72 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.11 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.21 Median :0.5000 Median :10.30 Median :6.000
## Mean :3.22 Mean :0.5287 Mean :10.51 Mean :5.823
## 3rd Qu.:3.32 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.01 Max. :2.0000 Max. :14.00 Max. :9.000
## color x
## Length:6308 Min. : 1
## Class :character 1st Qu.:1658
## Mode :character Median :3258
## Mean :3261
## 3rd Qu.:4874
## Max. :6497
which(is.na(df))
## integer(0)
sum(is.na(df))
## [1] 0
summary(df$alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 9.50 10.30 10.51 11.30 14.00
summary(df)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. : 0.080 Min. :0.000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.: 0.230 1st Qu.:0.240 1st Qu.: 1.800
## Median : 7.000 Median : 0.290 Median :0.310 Median : 2.900
## Mean : 7.162 Mean : 1.347 Mean :0.316 Mean : 5.303
## 3rd Qu.: 7.600 3rd Qu.: 0.400 3rd Qu.:0.390 3rd Qu.: 8.000
## Max. :14.200 Max. :1185.000 Max. :1.660 Max. :22.600
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 78.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9948
## Mean :0.05566 Mean : 30.48 Mean :115.6 Mean :0.9946
## 3rd Qu.:0.06400 3rd Qu.: 41.00 3rd Qu.:155.0 3rd Qu.:0.9968
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0010
## pH sulphates alcohol quality
## Min. :2.72 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.11 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.21 Median :0.5000 Median :10.30 Median :6.000
## Mean :3.22 Mean :0.5287 Mean :10.51 Mean :5.823
## 3rd Qu.:3.32 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.01 Max. :2.0000 Max. :14.00 Max. :9.000
## color x
## Length:6308 Min. : 1
## Class :character 1st Qu.:1658
## Mode :character Median :3258
## Mean :3261
## 3rd Qu.:4874
## Max. :6497
# Observations from the summary:
# - Mean residual sugar is 5.3 g/l.
# - Mean free sulfur dioxide is 30.5 ppm. The maximum of 289 ppm is very high,
#   given that the third quartile is only 41 ppm.
# - pH ranges from 2.7 to 4.0, with a mean of 3.2.
# - Alcohol: the lightest wine has 8% alcohol.
# - Quality: the lowest score is 3, the mean is 5.8 and the highest is 9.
# Correlate each of the first twelve columns with quality. Column 12 is
# quality itself, so the last row of the result is trivially 1.
cor(x = df[, 1:12], y = df[["quality"]])
## [,1]
## fixed_acidity -0.07696280
## volatile_acidity -0.05507775
## citric_acid 0.09126767
## residual_sugar -0.03114074
## chlorides -0.19962943
## free sulfur dioxide 0.05655009
## total sulfur dioxide -0.03805021
## density -0.31937078
## pH 0.01851999
## sulphates 0.03809322
## alcohol 0.44917292
## quality 1.00000000
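# NOTE(review): the figures below differ from the cor() output above; confirm
# which data set they refer to.
# sulphates: 0.2514
# citric acid: 0.2264
# total sulfur dioxide: -0.18
# alcohol: 0.0226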
summary(df$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.823 6.000 9.000
# Number of wines at each quality score.
table(df[["quality"]])
##
## 3 4 5 6 7 8 9
## 28 213 2059 2759 1053 191 5
# Install ggplot2 only when it is missing; an unconditional install.packages()
# reinstalls the package on every run. The package name must be quoted with
# straight quotes, because typographic quotes do not parse in R.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
# Histogram of quality scores, filled by wine colour, one bar per score.
# ggplot() replaces qplot(), which is deprecated in current ggplot2 releases.
# coord_cartesian() zooms the axis without discarding data; putting limits on
# the scale removed the outermost bars, which caused the "Removed 4 rows"
# warning.
ggplot(df, aes(x = quality, fill = color)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(3, 10, 1)) +
  coord_cartesian(xlim = c(2.5, 10.5))
## Warning: Removed 4 rows containing missing values (geom_bar).
summary(df$alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 9.50 10.30 10.51 11.30 14.00
# Histogram of alcohol content in 0.5-wide bins, filled by wine colour.
# ggplot() replaces qplot(), which is deprecated in current ggplot2 releases.
# The view is widened by half a bin on each side with coord_cartesian(), so
# the edge bars are drawn instead of being dropped by scale limits.
ggplot(df, aes(x = alcohol, fill = color)) +
  geom_histogram(binwidth = 0.5) +
  scale_x_continuous(breaks = seq(8, 15, 0.5)) +
  coord_cartesian(xlim = c(7.75, 15.25))
## Warning: Removed 4 rows containing missing values (geom_bar).
# Density curve of quality for each wine colour.
# binwidth is omitted: the density geom ignores it, which is what triggered
# the "Ignoring unknown parameters: binwidth" warning. ggplot() replaces the
# deprecated qplot().
ggplot(df, aes(x = quality, colour = color)) +
  geom_density() +
  scale_x_continuous(breaks = seq(3, 9, 1))
## Warning: Ignoring unknown parameters: binwidth
# Install caTools only when it is missing. The package name must be quoted
# with straight quotes, because typographic quotes do not parse in R.
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools")
}
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
# Fix the RNG seed so the split below is reproducible.
set.seed(150)
# Mark roughly 80% of the rows via caTools::sample.split(), keyed on quality.
# NOTE(review): the variable shares its name with base::split(); the name is
# kept because code outside this view may refer to it.
split <- sample.split(df$quality, SplitRatio = 0.8)