# ---- Setup ------------------------------------------------------------
# Packages attached by this script. Every call sits on its own line and
# uses plain ASCII quotes: R cannot parse typographic quotes, nor several
# statements on one line without `;` separators.
required_pkgs <- c(
  "ggplot2", "GGally", "readxl", "caTools", "corrgram",
  "knitr", "dplyr", "tidyr"
)

# Install only what is missing instead of re-installing on every run.
# GGally is included because it is attached below.
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(ggplot2)
library(GGally)
library(readxl)
library(caTools)
library(corrgram)
library(knitr)
library(dplyr)
library(tidyr)

# ---- Load data --------------------------------------------------------
# Both workbooks are read from the current working directory.
wine_red <- read_excel("wine-red2.xlsx")
wine_white <- read_excel("wine-white2.xlsx")

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(wine_red)
  View(wine_white)
}

# Working copies; the freshly read tables stay untouched.
red <- wine_red
white <- wine_white
# Tag every row with the wine type it came from
red$color <- "red"
white$color <- "white"
# Stack the red and white tables into one dataset
data <- rbind(red, white)
head(data)
tail(data)
dim(data)
## [1] 6497 13
names(data)
## [1] "fixed acidity" "volatile acidity" "citric acid"
## [4] "residual sugar" "chlorides" "free sulfur dioxide"
## [7] "total sulfur dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
## [13] "color"
# Replace the spreadsheet headers (which contain spaces) with
# underscore-separated names, in the same column order
colnames(data) <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free_sulfurdioxide", "total_sulfurdioxide", "density",
  "pH", "sulphates", "alcohol", "quality", "color"
)
summary(data)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean :0.3397 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :65.800
## chlorides free_sulfurdioxide total_sulfurdioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. : 0.99
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.: 0.99
## Median :0.04700 Median : 29.00 Median :118.0 Median : 0.99
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean : 96.81
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.: 1.00
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :100196.00
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.0 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.5 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median : 10.3 Median :6.000
## Mean :3.219 Mean :0.5313 Mean : 412.1 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.: 11.3 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :1490646.0 Max. :9.000
## color
## Length:6497
## Class :character
## Mode :character
##
##
##
str(data)
## tibble [6,497 × 13] (S3: tbl_df/tbl/data.frame)
## $ fixed_acidity : num [1:6497] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile_acidity : num [1:6497] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric_acid : num [1:6497] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual_sugar : num [1:6497] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num [1:6497] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free_sulfurdioxide : num [1:6497] 11 25 15 17 11 13 15 15 9 17 ...
## $ total_sulfurdioxide: num [1:6497] 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num [1:6497] 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num [1:6497] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num [1:6497] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num [1:6497] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
## $ color : chr [1:6497] "red" "red" "red" "red" ...
#data$x <- 1:nrow(data)
#data
#str(data)
# Coerce the eleven measurement columns to double by round-tripping each
# one through character, one column at a time.
measurement_cols <- c(
  "fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
  "chlorides", "free_sulfurdioxide", "total_sulfurdioxide", "density",
  "pH", "sulphates", "alcohol"
)
for (col_name in measurement_cols) {
  data[[col_name]] <- as.numeric(as.character(data[[col_name]]))
}
# quality gets the same round trip but is stored as integer
data[["quality"]] <- as.integer(as.character(data[["quality"]]))
# Exploratory Data Analysis (EDA) and Pre-Processing
summary(data)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean :0.3397 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :65.800
## chlorides free_sulfurdioxide total_sulfurdioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. : 0.99
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.: 0.99
## Median :0.04700 Median : 29.00 Median :118.0 Median : 0.99
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean : 96.81
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.: 1.00
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :100196.00
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.0 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.5 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median : 10.3 Median :6.000
## Mean :3.219 Mean :0.5313 Mean : 412.1 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.: 11.3 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :1490646.0 Max. :9.000
## color
## Length:6497
## Class :character
## Mode :character
##
##
##
# Correlogram of the merged data before any outlier removal: variables are
# reordered (order = TRUE) and panel.conf is drawn in the lower triangle.
# NOTE(review): the par(usr) warnings recorded below are raised while the
# panels are drawn - presumably a corrgram / R version mismatch; confirm.
library(corrgram)
corrgram (data, order = TRUE , lower.panel=panel.conf)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
# Density values flagged as potential outliers by boxplot.stats()
# (IQR criterion)
out <- boxplot.stats(data$density)$out
out
## [1] 1.00100e+03 1.00100e+03 1.00020e+04 1.00040e+04 1.01030e+00 1.01030e+00
## [7] 1.00182e+05 1.03898e+00 1.00014e+05 1.00196e+05 1.00037e+05 1.00038e+05
## [13] 1.00038e+05
# Row positions in `data` that hold one of the flagged values
out_ind <- which(is.element(data$density, out))
out_ind
## [1] 1791 2643 2665 3180 3253 3263 3850 4381 4862 5020 5145 5614 5618
# Fences placed 1.5 * IQR beyond the first and third quartiles
Q <- quantile(data$density, probs = c(0.25, 0.75), na.rm = FALSE)
iqr <- IQR(data$density)
low <- Q[1] - 1.5 * iqr # lower fence
up <- Q[2] + 1.5 * iqr # upper fence
# Keep only the rows whose density lies strictly between the two fences
eliminated <- subset(data, density > low & density < up)
eliminated
summary(eliminated)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.214 Mean :0.3397 Mean :0.3184 Mean : 5.406
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :26.050
## chlorides free_sulfurdioxide total_sulfurdioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9949
## Mean :0.05604 Mean : 30.51 Mean :115.6 Mean :0.9947
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.:0.9969
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.0 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.5 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median : 10.3 Median :6.000
## Mean :3.219 Mean :0.5313 Mean : 412.9 Mean :5.819
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.: 11.3 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :1490646.0 Max. :9.000
## color
## Length:6484
## Class :character
## Mode :character
##
##
##
# Alcohol values flagged as outliers among the rows that survived the
# density filter. Computed once and reused for the removal below.
outliers1 <- boxplot(eliminated$alcohol, plot = FALSE)$out
outliers1
## [1] 1490646 1006404 102411 10050
# Positions of flagged alcohol values in the unfiltered `data`, kept for
# inspection only: these indices refer to `data`, not to `eliminated`.
out <- boxplot.stats(data$alcohol)$out
out_ind <- which(data$alcohol %in% out)
out_ind
## [1] 653 1014 5518 6103
# Drop the outlier rows with a logical mask. Negative indexing through
# -which(...) selects zero rows when nothing matches, which would silently
# empty the data frame; the mask keeps every row in that case.
df <- eliminated[!(eliminated$alcohol %in% outliers1), ]
summary(df)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.213 Mean :0.3396 Mean :0.3185 Mean : 5.406
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.600 Max. :1.5800 Max. :1.6600 Max. :26.050
## chlorides free_sulfurdioxide total_sulfurdioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9949
## Mean :0.05603 Mean : 30.51 Mean :115.6 Mean :0.9947
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.:0.9969
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :10.30 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :10.49 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.00 Max. :9.000
## color
## Length:6480
## Class :character
## Mode :character
##
##
##
hist(df$quality)
summary(df$alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 9.50 10.30 10.49 11.30 14.00
which(is.na(df))
## integer(0)
sum(is.na(df))
## [1] 0
df <- na.omit(df)
summary(df)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.213 Mean :0.3396 Mean :0.3185 Mean : 5.406
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.600 Max. :1.5800 Max. :1.6600 Max. :26.050
## chlorides free_sulfurdioxide total_sulfurdioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9949
## Mean :0.05603 Mean : 30.51 Mean :115.6 Mean :0.9947
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.:0.9969
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :10.30 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :10.49 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.00 Max. :9.000
## color
## Length:6480
## Class :character
## Mode :character
##
##
##
which(is.na(df))
## integer(0)
sum(is.na(df))
## [1] 0
summary(df$alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 9.50 10.30 10.49 11.30 14.00
summary(df)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.213 Mean :0.3396 Mean :0.3185 Mean : 5.406
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.600 Max. :1.5800 Max. :1.6600 Max. :26.050
## chlorides free_sulfurdioxide total_sulfurdioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9949
## Mean :0.05603 Mean : 30.51 Mean :115.6 Mean :0.9947
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.:0.9969
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :10.30 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :10.49 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.00 Max. :9.000
## color
## Length:6480
## Class :character
## Mode :character
##
##
##
# Observations from the summary: the mean residual sugar level is 5.4 g/l. Mean free sulfur dioxide is 30.5 ppm; the maximum of 289 ppm is very high given that the 75th percentile is 41 ppm. The pH of the wines ranges from 2.7 to 4.0, with a mean of 3.2.
# Alcohol: the lightest wine is 8%; the strongest is 14%.
# The minimum quality mark is 3, the mean is 5.8 and the highest is 9.
# Same correlogram as before, now drawn from `df` (the table left after
# the density and alcohol outlier rows were removed).
library(corrgram)
corrgram (df, order = TRUE , lower.panel=panel.conf)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
cor(x=df[,1:12], y=df$quality)
## [,1]
## fixed_acidity -0.07549878
## volatile_acidity -0.26696446
## citric_acid 0.08717157
## residual_sugar -0.03748329
## chlorides -0.20059999
## free_sulfurdioxide 0.05553655
## total_sulfurdioxide -0.04154195
## density -0.31314424
## pH 0.01895031
## sulphates 0.03844359
## alcohol 0.44541285
## quality 1.00000000
#alcohol: 0,445
#citric acid : 0.087
#free sulfur dioxide: 0.056
#sulphates:0,038
#pH : 0.01895031
#density: -0.313
#volatile_acidity: -0.27
#chlorides: -0.200
#fixed_acidity: -0.076
#total sulfur dioxide: -0.042
#residual_sugar:-0.037
#Quality is mostly correlated with amount of alcohol.
#The largest positive correlations with quality are alcohol (0.45), citric acid (0.09) and free sulfur dioxide (0.06). The largest negative correlations are density (-0.31), volatile acidity (-0.27) and chlorides (-0.20).
summary(df$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.818 6.000 9.000
table(df$ quality)
##
## 3 4 5 6 7 8 9
## 30 216 2132 2828 1076 193 5
library(ggplot2)
# Histogram of quality scores, stacked by wine colour.
# qplot() is deprecated in current ggplot2, so the explicit ggplot() form
# is used. No x limits are set: limits of c(3, 10) cut through the bins
# centred on 3 and 10, and ggplot2 drops bars that cross a scale limit
# (the likely cause of the former "Removed 4 rows" warning).
ggplot(df, aes(x = quality, fill = color)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(3, 10, 1))
summary(df$alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 9.50 10.30 10.49 11.30 14.00
# Histogram of alcohol content in 0.5-wide bins aligned to start at 8.
# coord_cartesian() zooms the view without discarding any bars, unlike
# limits set on the scale.
ggplot(df, aes(x = alcohol, fill = color)) +
  geom_histogram(binwidth = 0.5, boundary = 8) +
  scale_x_continuous(breaks = seq(8, 15, 0.5)) +
  coord_cartesian(xlim = c(8, 15))
# Density of quality scores, one curve per wine colour. binwidth is not a
# density parameter (it was ignored with a warning), so it is not passed.
ggplot(df, aes(x = quality, colour = color)) +
  geom_density() +
  scale_x_continuous(breaks = seq(3, 9, 1))
# Factor-typed copies of every numeric column, one free-standing vector per
# column (the data frame itself is not modified).
# NOTE(review): these are built from `data` (the table before the density
# and alcohol outlier rows were dropped), not from `df`, and none of them
# is referenced again in the visible part of the script - confirm whether
# `df` was intended or whether they can be removed.
ffixed_acidity <- as.factor(as.numeric(data$fixed_acidity))
fvolatile_acidity <- as.factor(as.numeric(data$volatile_acidity))
fcitric_acid <- as.factor(as.numeric(data$citric_acid))
fresidual_sugar <- as.factor(as.numeric(data$residual_sugar))
fchlorides <- as.factor(as.numeric(data$chlorides))
ffree_sulfurdioxide <- as.factor(as.numeric(data$free_sulfurdioxide))
ftotal_sulfurdioxide <- as.factor(as.numeric(data$total_sulfurdioxide))
fdensity <- as.factor(as.numeric(data$density))
fpH <- as.factor(as.numeric(data$pH))
fsulphates <- as.factor(as.numeric(data$sulphates))
falcohol <- as.factor(as.numeric(data$alcohol))
fquality <- as.factor(as.integer(data$quality))
# Summary statistics of the quality scores in the cleaned table `df`
quality_min <- min(df$quality)
quality_max <- max(df$quality)
quality_mean <- mean(df$quality)
quality_median <- median(df$quality)
quality_iqr <- IQR(df$quality)
# Quartiles are taken directly from quantile(). "median -/+ IQR" is not
# Q1/Q3: with median 6 and IQR 1 it yields Q3 = 7, whereas summary()
# reports a third quartile of 6 for these data.
quality_q1 <- quantile(df$quality, probs = 0.25, names = FALSE)
quality_q3 <- quantile(df$quality, probs = 0.75, names = FALSE)
alcohol_mean <- mean(df$alcohol)
# Visualization of Wine Quality
# Bar chart of the number of wines per quality score, with the count
# printed above each bar.
# fill is mapped to factor(quality): a continuous fill cannot vary within
# the single group that stat = "count" works on, so it never coloured the
# bars. after_stat(count) replaces the deprecated ..count.. notation.
ggplot(df, aes(x = quality, fill = factor(quality))) +
  geom_bar(stat = "count") +
  geom_text(
    position = "stack", stat = "count",
    aes(label = after_stat(count)), vjust = -0.5
  ) +
  labs(
    title = "Distribution of Wine Quality Ratings",
    x = "Wine Quality", y = "Num of Observations", fill = "Wine Quality"
  )
### Quality is not balanced across the 0-10 rating scale: the observed scores run from 3 to 9 and most of them are 5 or 6. In other words, there are many more average wines than excellent or poor ones.
#df%>%
# gather(df$quality, key = "var", value = "value") %>%
# ggplot(aes(x = quality, y = value, color = quality)) +
# geom_boxplot() +
# facet_wrap(~ var, scales = "free", ncol = 3)+
# theme(legend.position="none")
# Standardise only the eleven measurement columns (1-11). Column 12 of `df`
# is quality; scaling 1:12 and then appending df$quality produced a
# 13-column table whose last column (raw quality) was labelled "color" and
# therefore entered the PCA as an active variable.
wine2 <- scale(df[, 1:11])
# quality is attached as a factor in column 12, which is the column the PCA
# call below declares as qualitative supplementary (quali.sup = 12) and
# uses for colouring (habillage = 12).
wine_scaled <- data.frame(wine2, quality = factor(df$quality))
head(wine_scaled)
#install.packages("FactoMineR")
#install.packages("factoextra")
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Fix the RNG state before fitting and plotting
set.seed(150)
# PCA of the scaled table: column 12 is declared qualitative supplementary
# and FactoMineR's own graphs are switched off
res <- PCA(X = wine_scaled, quali.sup = 12, graph = FALSE)
# Biplot: individuals drawn as points coloured by column 12, variables in
# black, labels repelled
fviz_pca_biplot(
  res,
  geom.ind = "point",
  habillage = 12,
  col.var = "black",
  repel = TRUE
)
library(caTools)
# Seeded split: sample.split() on quality with SplitRatio = 0.8 yields a
# logical mask; TRUE rows form the training set, FALSE rows the test set
set.seed(150)
split <- sample.split(df$quality, SplitRatio = 0.8)
training_set <- df[split, ]
test_set <- df[!split, ]
#Logistic Regression
#log_reg <- glm(df$quality ~ df$density, #data=training_set, family = binomial)
#summary(log_reg)
#Decision Tree Regression
#library(rpart)
#classifier = rpart(formula = quality ~ .,
# data=training_set)
#y_pred = predict(classifier, newdata = test_set[-12],type="vector")
#cm = as.matrix(table(actual=test_set$quality,predicted=y_pred))
#cm
#RANDOM FOREST
#library(randomForest)
#set.seed(150)
#classifier1 = randomForest(x = training_set[-12],
# y = training_set$quality,
# ntree = 500)
#Predicting the Test set results
#y_pred1 = predict(classifier1, newdata = test_set[-12])
#summary(y_pred1)
# the Confusion Matrix
#cm1 = table(actual=test_set[, 12],predicted= y_pred1)
#summary(cm1)