tidyverse_extend vignette

OVERVIEW

TidyVerse CREATE assignment

Your task here is to Create an Example. Using one or more TidyVerse packages, and any dataset from fivethirtyeight.com or Kaggle, create a programming sample “vignette” that demonstrates how to use one or more of the capabilities of the selected TidyVerse package with your selected dataset. (25 points)

We are using the Kaggle dataset at: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009?select=winequality-red.csv

# Location of the red-wine-quality CSV (raw file hosted on GitHub).
WINE_QUALITY_RAW_URL <- paste0(
  "https://raw.githubusercontent.com/quaere1verum/sps_public/master/",
  "data607-001/assignments/tidyverse_assignments/winequality-red.csv"
)
library(curl)
# using tidy verse packages
# -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.3.1 --
#   v ggplot2 3.3.5     v purrr   0.3.4
# v tibble  3.1.3     v stringr 1.4.0
# v tidyr   1.1.3     v forcats 0.5.1
# v readr   2.0.1     
library(tidyverse)

Using the readr package from the tidyverse

The readr package provides the read_csv() function, which we use to load our data.

# Download a CSV file from a URL and read it with readr.
#
# url_path: a single, non-missing character string giving the file's URL.
# Returns:  whatever read_csv() returns for the downloaded file.
#
# The file is fetched into a temporary location with curl_download() and
# that temporary copy is removed again when the function exits.
load_csv_from_url <- function(url_path) {
  stopifnot(
    is.character(url_path),
    length(url_path) == 1L,
    !is.na(url_path)
  )

  tmp <- tempfile()
  # Clean up the download whether we return normally or fail part-way.
  on.exit(unlink(tmp), add = TRUE)

  curl_download(url_path, tmp)

  # read_csv() comes from readr (attached via tidyverse). lazy = FALSE
  # (readr >= 2.0.0) forces the whole file to be read now, so deleting the
  # temporary file on exit cannot invalidate the returned data.
  read_csv(tmp, lazy = FALSE)
}
wine_quality_data <- load_csv_from_url(WINE_QUALITY_RAW_URL)

Using the purrr package from the tidyverse

The purrr package provides functions such as map(), which apply a function to each element of a list or vector; typed variants such as map_dbl() return atomic vectors.

# purrr demo: one linear model per distinct alcohol value, then its R^2.
split(wine_quality_data, wine_quality_data$alcohol) %>%  # base R grouping
  map(function(group) {
    lm(`volatile acidity` ~ `citric acid`, data = group)
  }) %>%
  map(summary) %>%
  # map_dbl() is the purrr variant that returns a double vector
  map_dbl(function(fit_summary) fit_summary[["r.squared"]])

## Warning in summary.lm(.x[[i]], ...): essentially perfect fit: summary may be
## unreliable

## Warning in summary.lm(.x[[i]], ...): essentially perfect fit: summary may be
## unreliable

##              8.4              8.5              8.7              8.8 
##     1.0000000000     0.0000000000     1.0000000000     0.0000000000 
##                9             9.05              9.1              9.2 
##     0.1942945281     0.0000000000     0.0694344974     0.1951511024 
## 9.23333333333333             9.25              9.3              9.4 
##     0.0000000000     0.0000000000     0.0924041977     0.1810929038 
##              9.5             9.55 9.56666666666667              9.6 
##     0.2072390806     0.0000000000     0.0000000000     0.4695763488 
##              9.7              9.8              9.9             9.95 
##     0.4431093797     0.4039960680     0.3986926481     0.0000000000 
##               10 10.0333333333333             10.1             10.2 
##     0.1256507920     0.0000000000     0.2271160021     0.4156351741 
##             10.3             10.4             10.5            10.55 
##     0.2432631201     0.2610570548     0.3142775109     1.0000000000 
##             10.6             10.7            10.75             10.8 
##     0.4798595509     0.3693973762     0.0000000000     0.6250030646 
##             10.9               11 11.0666666666667             11.1 
##     0.4335736398     0.4951081778     0.0000000000     0.2040027947 
##             11.2             11.3             11.4             11.5 
##     0.2821644019     0.3984542424     0.3549279631     0.0968923827 
##             11.6             11.7             11.8             11.9 
##     0.3239283202     0.2284373428     0.4343813983     0.4426610006 
##            11.95               12             12.1             12.2 
##     0.0000000000     0.0442965369     0.1977712564     0.2247980355 
##             12.3             12.4             12.5             12.6 
##     0.6035459195     0.0001886136     0.5547656813     0.7547789777 
##             12.7             12.8             12.9               13 
##     0.4439088242     0.5196712422     0.5560564228     0.2758770359 
##             13.1             13.2             13.3             13.4 
##     1.0000000000     0.0000000000     0.6067816878     0.3591966286 
##             13.5 13.5666666666667             13.6               14 
##     0.0000000000     0.0000000000     0.3310912195     0.2316566723 
##             14.9 
##     0.0000000000

Using the ggplot2 package from the tidyverse

Adding a few cool plots that show some kind of relationship between acidity and alcohol quantity and quality.

# Fixed vs. volatile acidity, points coloured by alcohol content.
ggplot(
  data = wine_quality_data,
  mapping = aes(x = `fixed acidity`, y = `volatile acidity`, colour = alcohol)
) +
  geom_point()

# Same axes, points coloured by quality score instead.
ggplot(
  data = wine_quality_data,
  mapping = aes(x = `fixed acidity`, y = `volatile acidity`, colour = quality)
) +
  geom_point()

Extending - Bikram Barua

# Stacked bar graph: volatile acidity stacked within each quality score,
# with segments filled according to fixed acidity.
ggplot(
  data = wine_quality_data,
  mapping = aes(x = quality, y = `volatile acidity`, fill = `fixed acidity`)
) +
  geom_bar(stat = "identity", position = "stack") +
  ggtitle("Wine quality By Acidity")

library(ggpubr) # for arranging plots

## Warning: package 'ggpubr' was built under R version 4.0.5

# Build a stacked bar chart of `y_var` against `x_var`.
#
# data:  data frame holding both columns.
# x_var: unquoted column for the x axis.
# y_var: unquoted column whose values are stacked at each x.
# title: plot title (string).
#
# geom_col() is geom_bar(stat = "identity"). Every row contributes one
# segment, so a bar's height is the SUM of `y_var` over all rows sharing
# that x value and therefore grows with the number of observations.
# No fill aesthetic is mapped, so no manual fill scale is attached (it
# would have no effect).
stacked_bar_plot <- function(data, x_var, y_var, title) {
  ggplot(data, aes(x = {{ x_var }}, y = {{ y_var }})) +
    geom_col(position = "stack") +
    ggtitle(title)
}

# ex. 1 (the plotted column is `sulphates`, so the title says so)
sulphur <- stacked_bar_plot(
  wine_quality_data, alcohol, sulphates,
  "Sulphates by Alcohol percent"
)

# ex. 2
pH <- stacked_bar_plot(
  wine_quality_data, `fixed acidity`, pH,
  "pH By Acidity"
)

alcohol <- stacked_bar_plot(
  wine_quality_data, quality, alcohol,
  "Alcohol percent By Quality"
)

pH_quality <- stacked_bar_plot(
  wine_quality_data, quality, pH,
  "pH By Quality"
)

acidity_quality <- stacked_bar_plot(
  wine_quality_data, quality, `fixed acidity`,
  "Acidity By Quality"
)

desity_quality <- stacked_bar_plot(
  wine_quality_data, quality, density,
  "Density By Quality"
)

# Put plots together in a 2-column, 3-row grid
ggarrange(
  sulphur, pH, alcohol, pH_quality, acidity_quality, desity_quality,
  ncol = 2, nrow = 3
)

library(dplyr)


# dplyr::mutate(): add total acidity as the sum of the three acid columns.
total_wine_data <- mutate(
  wine_quality_data,
  total_acidity = `fixed acidity` + `volatile acidity` + `citric acid`
)

Filter observations where quality >= 5

# Keep the columns of interest, retain wines rated 5 or higher, and list
# the best-rated wines first.
quality5_data <- total_wine_data %>%
  select(
    `total sulfur dioxide`, pH, sulphates,
    alcohol, total_acidity, quality
  ) %>%
  filter(quality >= 5) %>%
  arrange(desc(quality))
quality5_data

## # A tibble: 1,536 x 6
##    `total sulfur dioxide`    pH sulphates alcohol total_acidity quality
##                     <dbl> <dbl>     <dbl>   <dbl>         <dbl>   <dbl>
##  1                     37  3.35      0.86    12.8          8.71       8
##  2                     13  3.23      0.82    12.6         11.1        8
##  3                     88  3.56      0.82    12.9          6.5        8
##  4                     29  2.88      0.82     9.8         13.6        8
##  5                     19  3.22      0.69    13.4         12.6        8
##  6                     17  3.15      0.92    11.7         10.3        8
##  7                     16  3.15      0.65    11           11.6        8
##  8                     16  3.15      0.65    11           11.6        8
##  9                     50  3.72      0.74    14            5.66       8
## 10                     45  3.46      0.74    12.7          8.46       8
## # ... with 1,526 more rows

ggplot2 package

Scatterplot

# Alcohol content at each quality score.
ggplot(quality5_data, aes(quality, alcohol)) +
  geom_point()

# Total acidity at each quality score (semi-transparent points).
ggplot(quality5_data, aes(quality, total_acidity)) +
  geom_point(alpha = 0.5) +
  labs(title = "Acidity vs. Quality") +
  theme_bw()

# Sulphates at each quality score.
ggplot(quality5_data, aes(quality, sulphates)) +
  geom_point(alpha = 0.5) +
  labs(title = "sulphates vs. Quality") +
  theme_bw()

# pH at each quality score.
ggplot(quality5_data, aes(quality, pH)) +
  geom_point(alpha = 0.5) +
  labs(title = "pH vs. Quality") +
  theme_bw()

Conclusion

Based on the data, there is no conclusive evidence that the quality of red wine is determined by acidity, pH value, sulphates, or alcohol percentage. In general, good-quality red wines were found to have lower values of the above than regular wines. One can say that sulphates are definitely lower in the good wines. Other than that, I did not find any conclusive evidence using the given dataset.