OVERVIEW
TidyVerse CREATE assignment
Your task here is to Create an Example. Using one or more TidyVerse packages, and any dataset from fivethirtyeight.com or Kaggle, create a programming sample “vignette” that demonstrates how to use one or more of the capabilities of the selected TidyVerse package with your selected dataset. (25 points)
We are using the Kaggle dataset at: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009?select=winequality-red.csv
# Raw GitHub URL of winequality-red.csv, the CSV file loaded in this vignette.
WINE_QUALITY_RAW_URL <- "https://raw.githubusercontent.com/quaere1verum/sps_public/master/data607-001/assignments/tidyverse_assignments/winequality-red.csv"
library(curl)
# using tidy verse packages
# -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.3.1 --
# v ggplot2 3.3.5 v purrr 0.3.4
# v tibble 3.1.3 v stringr 1.4.0
# v tidyr 1.1.3 v forcats 0.5.1
# v readr 2.0.1
library(tidyverse)
Using the readr package from tidyverse
The readr package provides the read_csv() function required to load our data.
#' Download a CSV file from a URL and read it into a tibble.
#'
#' The file is fetched with `curl_download()` (curl package) into a temporary
#' file, which is then parsed with `read_csv()` (readr package, part of
#' tidyverse).
#'
#' @param url_path URL of the CSV file to download.
#' @return The tibble returned by `read_csv()`.
load_csv_from_url <- function(url_path) {
  tmp <- tempfile()
  # Delete the temporary download when the function exits, even on error, so
  # repeated calls do not leave stray files in the session temp directory.
  on.exit(unlink(tmp), add = TRUE)
  curl_download(url_path, tmp)
  # Read eagerly so the returned data never depends on the file removed above.
  read_csv(tmp, lazy = FALSE)
}
wine_quality_data <- load_csv_from_url(WINE_QUALITY_RAW_URL)
Using the purrr package from tidyverse
The purrr package provides functions such as map(), which apply a function to each element of a list or vector and return the results as a list or vector.
# purrr demo: fit one linear model per distinct alcohol value, then extract
# the R-squared of every fit.
split(wine_quality_data, wine_quality_data$alcohol) %>% # split() is base R
  map(function(group) lm(`volatile acidity` ~ `citric acid`, data = group)) %>%
  map(summary) %>%
  map_dbl("r.squared") # map_dbl() is a purrr function; returns a double vector
## Warning in summary.lm(.x[[i]], ...): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(.x[[i]], ...): essentially perfect fit: summary may be
## unreliable
## 8.4 8.5 8.7 8.8
## 1.0000000000 0.0000000000 1.0000000000 0.0000000000
## 9 9.05 9.1 9.2
## 0.1942945281 0.0000000000 0.0694344974 0.1951511024
## 9.23333333333333 9.25 9.3 9.4
## 0.0000000000 0.0000000000 0.0924041977 0.1810929038
## 9.5 9.55 9.56666666666667 9.6
## 0.2072390806 0.0000000000 0.0000000000 0.4695763488
## 9.7 9.8 9.9 9.95
## 0.4431093797 0.4039960680 0.3986926481 0.0000000000
## 10 10.0333333333333 10.1 10.2
## 0.1256507920 0.0000000000 0.2271160021 0.4156351741
## 10.3 10.4 10.5 10.55
## 0.2432631201 0.2610570548 0.3142775109 1.0000000000
## 10.6 10.7 10.75 10.8
## 0.4798595509 0.3693973762 0.0000000000 0.6250030646
## 10.9 11 11.0666666666667 11.1
## 0.4335736398 0.4951081778 0.0000000000 0.2040027947
## 11.2 11.3 11.4 11.5
## 0.2821644019 0.3984542424 0.3549279631 0.0968923827
## 11.6 11.7 11.8 11.9
## 0.3239283202 0.2284373428 0.4343813983 0.4426610006
## 11.95 12 12.1 12.2
## 0.0000000000 0.0442965369 0.1977712564 0.2247980355
## 12.3 12.4 12.5 12.6
## 0.6035459195 0.0001886136 0.5547656813 0.7547789777
## 12.7 12.8 12.9 13
## 0.4439088242 0.5196712422 0.5560564228 0.2758770359
## 13.1 13.2 13.3 13.4
## 1.0000000000 0.0000000000 0.6067816878 0.3591966286
## 13.5 13.5666666666667 13.6 14
## 0.0000000000 0.0000000000 0.3310912195 0.2316566723
## 14.9
## 0.0000000000
Using the ggplot2 package from tidyverse
Adding a few cool plots that show the relationships between acidity, alcohol content, and quality.
# Fixed vs. volatile acidity, with points coloured by alcohol.
ggplot(
  wine_quality_data,
  aes(`fixed acidity`, `volatile acidity`, colour = alcohol)
) +
  geom_point()

# The same scatterplot, with points coloured by quality instead.
ggplot(
  wine_quality_data,
  aes(`fixed acidity`, `volatile acidity`, colour = quality)
) +
  geom_point()

# Extending - Bikram Barua
## Stacked bar graph
# Volatile acidity stacked per quality score, filled by fixed acidity.
ggplot(
  wine_quality_data,
  aes(fill = `fixed acidity`, y = `volatile acidity`, x = quality)
) +
  geom_bar(position = "stack", stat = "identity") +
  ggtitle("Wine quality By Acidity")

library(ggpubr) # for arranging plots
## Warning: package 'ggpubr' was built under R version 4.0.5
# Six bar charts, stored in variables so they can be arranged on one page.
# geom_col() is the ggplot2 shorthand for geom_bar(stat = "identity"); bars
# that share an x value are stacked by default.
# No fill scale is set: these plots map no `fill` aesthetic, so a manual fill
# palette would have no effect on the output.
# The variables `pH` and `alcohol` share their names with data columns; inside
# aes() the names are looked up in `wine_quality_data` first.

# ex. 1
sulphur <- ggplot(wine_quality_data, aes(x = alcohol, y = sulphates)) +
  geom_col() +
  ggtitle("Sulphur by Alcohol percent")

# ex. 2
pH <- ggplot(wine_quality_data, aes(x = `fixed acidity`, y = pH)) +
  geom_col() +
  ggtitle("pH By Acidity")

alcohol <- ggplot(wine_quality_data, aes(x = quality, y = alcohol)) +
  geom_col() +
  ggtitle("Alcohol percent By Quality")

pH_quality <- ggplot(wine_quality_data, aes(x = quality, y = pH)) +
  geom_col() +
  ggtitle("pH By Quality")

acidity_quality <- ggplot(
  wine_quality_data,
  aes(x = quality, y = `fixed acidity`)
) +
  geom_col() +
  ggtitle("Acidity By Quality")

desity_quality <- ggplot(wine_quality_data, aes(x = quality, y = density)) +
  geom_col() +
  ggtitle("Density By Quality")
# Put plots together: six panels in a grid of 3 rows by 2 columns.
ggarrange(
  sulphur, pH, alcohol, pH_quality, acidity_quality, desity_quality,
  ncol = 2, nrow = 3
)

library(dplyr)
# Add a combined acidity column: the sum of the three acid measurements.
total_wine_data <- wine_quality_data %>%
  mutate(total_acidity = `fixed acidity` + `volatile acidity` + `citric acid`)

# Filter observations where quality >= 5
# Keep only the columns used in the plots below, sorted best quality first.
quality5_data <- total_wine_data %>%
  select(
    `total sulfur dioxide`, pH, sulphates, alcohol, total_acidity, quality
  ) %>%
  filter(quality >= 5) %>%
  arrange(desc(quality))

quality5_data
## # A tibble: 1,536 x 6
## `total sulfur dioxide` pH sulphates alcohol total_acidity quality
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 37 3.35 0.86 12.8 8.71 8
## 2 13 3.23 0.82 12.6 11.1 8
## 3 88 3.56 0.82 12.9 6.5 8
## 4 29 2.88 0.82 9.8 13.6 8
## 5 19 3.22 0.69 13.4 12.6 8
## 6 17 3.15 0.92 11.7 10.3 8
## 7 16 3.15 0.65 11 11.6 8
## 8 16 3.15 0.65 11 11.6 8
## 9 50 3.72 0.74 14 5.66 8
## 10 45 3.46 0.74 12.7 8.46 8
## # ... with 1,526 more rows
ggplot2 package
Scatterplot
# Scatterplots of each measure against the quality score.
ggplot(data = quality5_data, aes(x = quality, y = alcohol)) +
  geom_point()

# alpha = 0.5 makes overlapping points visible as darker areas.
ggplot(data = quality5_data, aes(x = quality, y = total_acidity)) +
  geom_point(alpha = 0.5) +
  labs(title = "Acidity vs. Quality") +
  theme_bw()

ggplot(data = quality5_data, aes(x = quality, y = sulphates)) +
  geom_point(alpha = 0.5) +
  labs(title = "sulphates vs. Quality") +
  theme_bw()

ggplot(data = quality5_data, aes(x = quality, y = pH)) +
  geom_point(alpha = 0.5) +
  labs(title = "pH vs. Quality") +
  theme_bw()

# Conclusion
Based on the data, there is no conclusive evidence that the quality of red wine can be determined from acidity, pH, sulphates, and alcohol percentage. In general, good-quality red wines were found to have lower values of these measures than regular wines. One can say that sulphates are definitely lower in the good wines. Other than that, I did not find any conclusive evidence using the given dataset.