Exploring Red Wine Analysis by Laasya Dyavarashetty
========================================================
library(ggplot2)
library(dplyr)
library(GGally)
library(scales)
library(memisc)
library(reshape)
library(gridExtra)
# Load the red wine data set from the working directory.
redwine <- read.csv("wineQualityReds.csv")

# Summarize the data.
summary(redwine)

# List the names of the columns of the data.
names(redwine)
# Overview of every variable: one qplot() per column (qplot() draws a
# histogram when only x is supplied), arranged four per row.
grid.arrange(
  qplot(redwine$fixed.acidity),
  qplot(redwine$volatile.acidity),
  qplot(redwine$citric.acid),
  qplot(redwine$residual.sugar),
  qplot(redwine$chlorides),
  qplot(redwine$free.sulfur.dioxide),
  qplot(redwine$total.sulfur.dioxide),
  qplot(redwine$density),
  qplot(redwine$pH),
  qplot(redwine$sulphates),
  qplot(redwine$alcohol),
  qplot(redwine$quality),
  ncol = 4
)
# Bucket the quality score into a three-level rating:
#   quality < 5       -> 'good'
#   5 <= quality < 7  -> 'outstanding'
#   quality >= 7      -> 'classic'
redwine$rate <- ifelse(
  redwine$quality < 5,
  "good",
  ifelse(redwine$quality < 7, "outstanding", "classic")
)

# Store the rating as an ordered factor so the levels sort
# good < outstanding < classic in summaries and plots.
redwine$rate <- ordered(
  redwine$rate,
  levels = c("good", "outstanding", "classic")
)

summary(redwine$rate)
qplot(redwine$rate)
# Histograms of the three acidity measures on a log10 x-axis.

# Over fixed.acidity
ggplot(data = redwine, aes(x = fixed.acidity)) +
  geom_histogram() +
  scale_x_log10()

# Over volatile.acidity
ggplot(data = redwine, aes(x = volatile.acidity)) +
  geom_histogram() +
  scale_x_log10()

# Over citric.acid
# NOTE: log10(0) is -Inf, so rows with citric.acid == 0 cannot be placed on
# this axis; the count below shows how many rows that affects.
ggplot(data = redwine, aes(x = citric.acid)) +
  geom_histogram() +
  scale_x_log10()

# Number of observations whose citric.acid value is zero
# (the author's run reported 132).
length(subset(redwine, citric.acid == 0)$citric.acid)
# For each of residual.sugar, chlorides and sulphates, stack two histograms:
#   p1 - linear x-axis limited to [0, 95th percentile]
#   p2 - the same plot with a log10 x-axis; adding scale_x_log10() replaces
#        the truncating x scale that p1 carries.

# residual.sugar
p1 <- ggplot(data = redwine, aes(x = residual.sugar)) +
  geom_histogram() +
  scale_x_continuous(
    limits = c(0, quantile(redwine$residual.sugar, 0.95))
  ) +
  xlab("residual.sugar, 95th percentile truncated")
p2 <- p1 + scale_x_log10() + xlab("residual.sugar, log10")
grid.arrange(p1, p2, ncol = 1)

# chlorides
p1 <- ggplot(data = redwine, aes(x = chlorides)) +
  geom_histogram() +
  scale_x_continuous(
    limits = c(0, quantile(redwine$chlorides, 0.95))
  ) +
  xlab("chlorides, 95th percentile truncated")
p2 <- p1 + scale_x_log10() + xlab("chlorides, log10")
grid.arrange(p1, p2, ncol = 1)

# sulphates
p1 <- ggplot(data = redwine, aes(x = sulphates)) +
  geom_histogram() +
  scale_x_continuous(
    limits = c(0, quantile(redwine$sulphates, 0.95))
  ) +
  xlab("sulphates, 95th percentile truncated")
p2 <- p1 + scale_x_log10() + xlab("sulphates, log10")
grid.arrange(p1, p2, ncol = 1)

# Drop the temporary plot objects.
rm(p1, p2)
# Derived variable: the sum of the three acid measurements.
redwine$track.acidity <- redwine$fixed.acidity +
  redwine$volatile.acidity +
  redwine$citric.acid

qplot(redwine$track.acidity)
#' Draw a single box plot summarising one numeric vector.
#'
#' All values are placed in one group (x position labelled "simple") so the
#' plot shows the overall spread and outliers of `column`.
#'
#' @param column Vector of values to summarise.
#' @param ylab Label for the y-axis.
#' @return A ggplot object.
get_simple_boxplot <- function(column, ylab) {
  # Build the plot from the supplied vector rather than via the deprecated
  # qplot(), so the helper does not depend on a global data frame.
  ggplot(data.frame(value = column), aes(x = "simple", y = value)) +
    geom_boxplot() +
    labs(x = "", y = ylab)
}
# One stand-alone box plot per variable, arranged four per row.
grid.arrange(
  get_simple_boxplot(redwine$fixed.acidity, "fixed acidity"),
  get_simple_boxplot(redwine$volatile.acidity, "volatile acidity"),
  get_simple_boxplot(redwine$citric.acid, "citric acid"),
  get_simple_boxplot(redwine$track.acidity, "track acidity"),
  get_simple_boxplot(redwine$residual.sugar, "residual sugar"),
  get_simple_boxplot(redwine$chlorides, "chlorides"),
  get_simple_boxplot(redwine$free.sulfur.dioxide, "free sulf. dioxide"),
  get_simple_boxplot(redwine$total.sulfur.dioxide, "total sulf. dioxide"),
  get_simple_boxplot(redwine$density, "density"),
  get_simple_boxplot(redwine$pH, "pH"),
  get_simple_boxplot(redwine$sulphates, "sulphates"),
  get_simple_boxplot(redwine$alcohol, "alcohol"),
  ncol = 4
)
str(redwine)

# Pairwise overview on a reproducible random sample of 40 rows.
# The columns 'X' (presumably the row index read from the CSV - confirm) and
# the derived 'rate' factor are left out of the pairs plot.
set.seed(1)
redwine_sample <- redwine[
  sample(seq_len(nrow(redwine)), 40),
  # Logical selection: unlike -which(...), this keeps all columns when
  # neither excluded name is present.
  !(names(redwine) %in% c("X", "rate")),
  drop = FALSE
]

# ggpairs() no longer accepts `params`; per-panel arguments are supplied by
# wrapping the panel functions instead.
ggpairs(
  redwine_sample,
  lower = list(continuous = wrap("points", shape = ".")),
  upper = list(combo = wrap("box", outlier.shape = "."))
)

#' Draw box plots of `y`, one box per distinct value of `x`.
#'
#' @param x Grouping values; converted with factor() so that a numeric
#'   score yields one box per value instead of a single pooled box.
#' @param y Values to summarise within each group.
#' @param ylab Label for the y-axis.
#' @return A ggplot object.
get_bivariate_boxplot <- function(x, y, ylab) {
  ggplot(data.frame(x = factor(x), y = y), aes(x = x, y = y)) +
    geom_boxplot() +
    labs(y = ylab)
}
# Distribution of each variable per quality score, four plots per row.
# quality is passed as a factor so every score gets its own box;
# residual.sugar, chlorides and sulphates are shown on a log10 scale.
grid.arrange(
  get_bivariate_boxplot(factor(redwine$quality),
                        redwine$fixed.acidity,
                        "fixed acidity"),
  get_bivariate_boxplot(factor(redwine$quality),
                        redwine$volatile.acidity,
                        "volatile acidity"),
  get_bivariate_boxplot(factor(redwine$quality),
                        redwine$citric.acid,
                        "citric acid"),
  get_bivariate_boxplot(factor(redwine$quality),
                        redwine$track.acidity,
                        "track acidity"),
  get_bivariate_boxplot(factor(redwine$quality),
                        log10(redwine$residual.sugar),
                        "residual sugar"),
  get_bivariate_boxplot(factor(redwine$quality),
                        log10(redwine$chlorides),
                        "chlorides"),
  get_bivariate_boxplot(factor(redwine$quality),
                        redwine$free.sulfur.dioxide,
                        "free sulf. dioxide"),
  get_bivariate_boxplot(factor(redwine$quality),
                        redwine$total.sulfur.dioxide,
                        "total sulf. dioxide"),
  get_bivariate_boxplot(factor(redwine$quality),
                        redwine$density,
                        "density"),
  get_bivariate_boxplot(factor(redwine$quality),
                        redwine$pH,
                        "pH"),
  get_bivariate_boxplot(factor(redwine$quality),
                        log10(redwine$sulphates),
                        "sulphates"),
  get_bivariate_boxplot(factor(redwine$quality),
                        redwine$alcohol,
                        "alcohol"),
  ncol = 4
)
# Distribution of each variable per rating bucket, four plots per row.
# The grouping column is `rate`; the data frame has no `rating` column, so
# `redwine$rating` would evaluate to NULL.
grid.arrange(
  get_bivariate_boxplot(redwine$rate,
                        redwine$fixed.acidity,
                        "fixed acidity"),
  get_bivariate_boxplot(redwine$rate,
                        redwine$volatile.acidity,
                        "volatile acidity"),
  get_bivariate_boxplot(redwine$rate,
                        redwine$citric.acid,
                        "citric acid"),
  get_bivariate_boxplot(redwine$rate,
                        redwine$track.acidity,
                        "track acidity"),
  get_bivariate_boxplot(redwine$rate,
                        log10(redwine$residual.sugar),
                        "residual sugar"),
  get_bivariate_boxplot(redwine$rate,
                        log10(redwine$chlorides),
                        "chlorides"),
  get_bivariate_boxplot(redwine$rate,
                        redwine$free.sulfur.dioxide,
                        "free sulf. dioxide"),
  get_bivariate_boxplot(redwine$rate,
                        redwine$total.sulfur.dioxide,
                        "total sulf. dioxide"),
  get_bivariate_boxplot(redwine$rate,
                        redwine$density,
                        "density"),
  get_bivariate_boxplot(redwine$rate,
                        redwine$pH,
                        "pH"),
  get_bivariate_boxplot(redwine$rate,
                        log10(redwine$sulphates),
                        "sulphates"),
  get_bivariate_boxplot(redwine$rate,
                        redwine$alcohol,
                        "alcohol"),
  ncol = 4
)
#' Correlation estimate between two vectors.
#'
#' Runs cor.test() on `x` and the numeric conversion of `y` and keeps only
#' the estimate component of the result.
#'
#' @param x Numeric vector.
#' @param y Vector passed through as.numeric() before testing.
#' @return The `estimate` element of the cor.test() result.
simple_cor_test <- function(x, y) {
  y_numeric <- as.numeric(y)
  test_result <- cor.test(x, y_numeric)
  test_result[["estimate"]]
}
# Correlation of each variable with quality. residual.sugar, chlorides and
# sulphates are log10-transformed before the test.
correlations <- c(
  simple_cor_test(redwine$fixed.acidity, redwine$quality),
  simple_cor_test(redwine$volatile.acidity, redwine$quality),
  simple_cor_test(redwine$citric.acid, redwine$quality),
  simple_cor_test(redwine$track.acidity, redwine$quality),
  simple_cor_test(log10(redwine$residual.sugar), redwine$quality),
  simple_cor_test(log10(redwine$chlorides), redwine$quality),
  simple_cor_test(redwine$free.sulfur.dioxide, redwine$quality),
  simple_cor_test(redwine$total.sulfur.dioxide, redwine$quality),
  simple_cor_test(redwine$density, redwine$quality),
  simple_cor_test(redwine$pH, redwine$quality),
  simple_cor_test(log10(redwine$sulphates), redwine$quality),
  simple_cor_test(redwine$alcohol, redwine$quality)
)

# Label each estimate with the variable it was computed from (same order as
# the calls above).
# NOTE(review): 'log10.chlordies' looks like a misspelling of
# 'log10.chlorides'; kept as-is in case other code looks the value up by
# this name.
names(correlations) <- c(
  "fixed.acidity", "volatile.acidity", "citric.acid", "track.acidity",
  "log10.residual.sugar", "log10.chlordies", "free.sulfur.dioxide",
  "total.sulfur.dioxide", "density", "pH", "log10.sulphates", "alcohol"
)
# Pairwise scatter plots among alcohol, log10(sulphates), volatile.acidity
# and citric.acid, each split into one panel per rate level.

# alcohol against log10(sulphates)
ggplot(redwine, aes(log10(sulphates), alcohol)) +
  geom_point() +
  facet_wrap(~rate)

# alcohol against volatile.acidity
ggplot(redwine, aes(volatile.acidity, alcohol)) +
  geom_point() +
  facet_wrap(~rate)

# alcohol against citric.acid
ggplot(redwine, aes(citric.acid, alcohol)) +
  geom_point() +
  facet_wrap(~rate)

# log10(sulphates) against volatile.acidity
ggplot(redwine, aes(volatile.acidity, log10(sulphates))) +
  geom_point() +
  facet_wrap(~rate)

# log10(sulphates) against citric.acid
ggplot(redwine, aes(citric.acid, log10(sulphates))) +
  geom_point() +
  facet_wrap(~rate)

# volatile.acidity against citric.acid
ggplot(redwine, aes(citric.acid, volatile.acidity)) +
  geom_point() +
  facet_wrap(~rate)
# Relationships among the acidity measures: a scatter plot followed by the
# matching correlation test for each pair.

# citric.acid against fixed.acidity
ggplot(data = redwine, aes(x = fixed.acidity, y = citric.acid)) +
  geom_point()
cor.test(redwine$fixed.acidity, redwine$citric.acid)

# citric.acid against volatile.acidity
ggplot(data = redwine, aes(x = volatile.acidity, y = citric.acid)) +
  geom_point()
cor.test(redwine$volatile.acidity, redwine$citric.acid)

# pH against log10 of the combined acidity
ggplot(data = redwine, aes(x = log10(track.acidity), y = pH)) +
  geom_point()
cor.test(log10(redwine$track.acidity), redwine$pH)
# Linear model of pH as a function of log10(track.acidity).
w <- lm(I(pH) ~ I(log10(track.acidity)), data = redwine)
redwine$pH.predictions <- predict(w, redwine)

# Relative error of the fit: (predicted - observed) / observed
redwine$pH.error <- (redwine$pH.predictions - redwine$pH) / redwine$pH

# Relative pH error per quality score. quality is wrapped in factor() so
# that each score gets its own box rather than one pooled box.
ggplot(data = redwine, aes(x = factor(quality), y = pH.error)) +
  geom_boxplot() +
  xlab("quality")
# total.sulfur.dioxide against free.sulfur.dioxide, with a smoothed trend
# line, followed by the correlation test for the same pair.
ggplot(data = redwine, aes(x = free.sulfur.dioxide, y = total.sulfur.dioxide)) +
  geom_point() +
  geom_smooth()

cor.test(redwine$free.sulfur.dioxide, redwine$total.sulfur.dioxide)
Several relationships looked interesting enough to explore in more depth, chiefly those involving alcohol, acidity, and sulphates.
The alcohol and quality plots are particularly interesting, because the two appear to increase and decrease together.
Alcohol vs quality of red wine
# Scatter plots coloured by quality, one panel per rate level.

# volatile.acidity against citric.acid
ggplot(redwine, aes(citric.acid, volatile.acidity, color = quality)) +
  facet_wrap(~rate) +
  geom_point()

# log10(sulphates) against alcohol
ggplot(redwine, aes(alcohol, log10(sulphates), color = quality)) +
  facet_wrap(~rate) +
  geom_point()

# alcohol against pH
ggplot(redwine, aes(pH, alcohol, color = quality)) +
  facet_wrap(~rate) +
  geom_point()
# Acidity measures and pH per quality score.
# quality is wrapped in factor() for both x and fill so that each score gets
# its own, separately coloured box.
grid.arrange(
  ggplot(data = redwine,
         aes(x = factor(quality), y = fixed.acidity,
             fill = factor(quality))) +
    ylab("Fixed Acidity (g/dm^3)") +
    xlab("Quality") +
    labs(fill = "quality") +
    geom_boxplot(),
  ggplot(data = redwine,
         aes(x = factor(quality), y = volatile.acidity,
             fill = factor(quality))) +
    ylab("Volatile Acidity (g/dm^3)") +
    xlab("Quality") +
    labs(fill = "quality") +
    geom_boxplot(),
  ggplot(data = redwine,
         aes(x = factor(quality), y = citric.acid,
             fill = factor(quality))) +
    ylab("Citric Acid (g/dm^3)") +
    xlab("Quality") +
    labs(fill = "quality") +
    geom_boxplot(),
  ggplot(data = redwine,
         aes(x = factor(quality), y = pH,
             fill = factor(quality))) +
    ylab("pH") +
    xlab("Quality") +
    labs(fill = "quality") +
    geom_boxplot()
)
# Alcohol content per quality score, coloured by rating bucket.
# quality is wrapped in factor() so that each score gets its own box.
ggplot(data = redwine, aes(x = factor(quality), y = alcohol, fill = rate)) +
  geom_boxplot() +
  ggtitle("Quality of wine based on alcohol levels") +
  xlab("Quality") +
  ylab("Alcohol (% volume)")
# Alcohol against volatile acidity for the two extreme rating buckets.
# The levels of rate are 'good', 'outstanding' and 'classic'; the previous
# filter compared against 'average', which is not one of them and therefore
# removed nothing. The middle bucket is 'outstanding'.
# NOTE(review): confirm that dropping the middle bucket is the intent.
ggplot(data = subset(redwine, rate != "outstanding"),
       aes(x = volatile.acidity, y = alcohol, color = rate)) +
  geom_point() +
  ggtitle("Alcohol vs. Volatile Acidity and Wine Quality") +
  xlab("Volatile Acidity (g / dm^3)") +
  ylab("Alcohol (% volume)")