library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(ggcorrplot)
library(vcd)
## Loading required package: grid
library(ade4)
library(ggfortify)
beers <- read.csv("beer_profile_and_ratings.csv")
head(beers)
General 1. Name: Beer name (label) 2. Style: Beer Style 3. Brewery: Brewery name 4. Beer Name: Complete beer name (Brewery + Brew Name) 5. Description: Notes on the beer if available 6. ABV: Alcohol content of beer (% by volume) 7. Min IBU: The minimum IBU value each beer can possess 8. Max IBU: The maximum IBU value each beer can possess
Mouth feel 9. Astringency 10. Body 11. Alcohol
Taste 12. Bitter 13. Sweet 14. Sour 15. Salty
Flavor And Aroma 16. Fruits 17. Hoppy 18. Spices 19. Malty
Reviews 20. review_aroma 21. review_appearance 22. review_palate 23. review_taste 24. review_overall 25. number_of_reviews
summary(beers)
## Name Style Brewery Beer.Name..Full.
## Length:3197 Length:3197 Length:3197 Length:3197
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Description ABV Min.IBU Max.IBU
## Length:3197 Min. : 0.000 Min. : 0.00 Min. : 0.00
## Class :character 1st Qu.: 5.000 1st Qu.:15.00 1st Qu.: 25.00
## Mode :character Median : 6.000 Median :20.00 Median : 35.00
## Mean : 6.527 Mean :21.18 Mean : 38.99
## 3rd Qu.: 7.600 3rd Qu.:25.00 3rd Qu.: 45.00
## Max. :57.500 Max. :65.00 Max. :100.00
## Astringency Body Alcohol Bitter
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 9.00 1st Qu.: 29.00 1st Qu.: 6.00 1st Qu.: 17.00
## Median :14.00 Median : 40.00 Median : 11.00 Median : 31.00
## Mean :16.52 Mean : 46.13 Mean : 17.06 Mean : 36.36
## 3rd Qu.:21.00 3rd Qu.: 58.00 3rd Qu.: 22.00 3rd Qu.: 52.00
## Max. :81.00 Max. :175.00 Max. :139.00 Max. :150.00
## Sweet Sour Salty Fruits
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 33.00 1st Qu.: 11.00 1st Qu.: 0.000 1st Qu.: 12.00
## Median : 54.00 Median : 22.00 Median : 0.000 Median : 29.00
## Mean : 58.27 Mean : 33.15 Mean : 1.017 Mean : 38.53
## 3rd Qu.: 77.00 3rd Qu.: 42.00 3rd Qu.: 1.000 3rd Qu.: 60.00
## Max. :263.00 Max. :284.00 Max. :48.000 Max. :175.00
## Hoppy Spices Malty review_aroma
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. :1.510
## 1st Qu.: 18.00 1st Qu.: 4.00 1st Qu.: 45.00 1st Qu.:3.423
## Median : 33.00 Median : 10.00 Median : 73.00 Median :3.720
## Mean : 40.92 Mean : 18.35 Mean : 75.33 Mean :3.639
## 3rd Qu.: 56.00 3rd Qu.: 23.00 3rd Qu.:103.00 3rd Qu.:3.978
## Max. :172.00 Max. :184.00 Max. :239.00 Max. :5.000
## review_appearance review_palate review_taste review_overall
## Min. :1.571 Min. :1.286 Min. :1.214 Min. :1.136
## 1st Qu.:3.605 1st Qu.:3.470 1st Qu.:3.500 1st Qu.:3.567
## Median :3.833 Median :3.742 Median :3.792 Median :3.830
## Mean :3.754 Mean :3.660 Mean :3.702 Mean :3.748
## 3rd Qu.:4.000 3rd Qu.:3.966 3rd Qu.:4.033 3rd Qu.:4.033
## Max. :4.667 Max. :5.000 Max. :5.000 Max. :5.000
## number_of_reviews
## Min. : 1.0
## 1st Qu.: 23.0
## Median : 93.0
## Mean : 233.3
## 3rd Qu.: 284.0
## Max. :3290.0
## There is no missing data
colSums(is.na(beers))
## Name Style Brewery Beer.Name..Full.
## 0 0 0 0
## Description ABV Min.IBU Max.IBU
## 0 0 0 0
## Astringency Body Alcohol Bitter
## 0 0 0 0
## Sweet Sour Salty Fruits
## 0 0 0 0
## Hoppy Spices Malty review_aroma
## 0 0 0 0
## review_appearance review_palate review_taste review_overall
## 0 0 0 0
## number_of_reviews
## 0
bs <- read.csv("beer_styles.csv")
bs
ggplot(data = bs) +
geom_point(aes(x = Style, y = Average.of.review_aroma, color = 'red'), size = 4, alpha = 0.4) +
geom_point(aes(x = Style, y = Average.of.review_appearance, color = 'blue'), size = 5, alpha = 0.4) +
geom_point(aes(x = Style, y = Average.of.review_palate, color = 'green'), size = 6, alpha = 0.4) +
geom_point(aes(x = Style, y = Average.of.review_taste, color = 'purple'), size = 7, alpha = 0.4) +
geom_point(aes(x = Style, y = Average.of.review_overall, color = 'gray'), size = 8, alpha = 0.4) +
scale_color_discrete(name = "Metric", labels = c("Aroma", "Appearance", "Palate", "Taste", "Overall")) +
xlab('Beer Styles') +
ylab('Average Scores') +
theme_bw() +
theme(axis.text.x = element_text(angle=90, hjust=1))
br <- read.csv("reviews.csv")
br
ggplot(data = br) +
geom_col(aes(x = Style, y = number_of_reviews), fill = 'orange', alpha = 0.8) +
xlab('Beer Styles') +
ylab('Number of Reviews') +
theme_bw() +
theme(axis.text.x = element_text(angle=90, hjust=1))
beers_num <- subset(beers, select = c(ABV, Min.IBU, Max.IBU, Astringency, Body, Alcohol, Bitter, Sweet, Sour, Salty, Fruits, Hoppy, Spices, Malty))
beers_num
beers_num_long <- melt(beers_num)
## No id variables; using all as measure variables
ggplot(data = beers_num_long, aes(x = variable, y = value)) +
geom_boxplot(aes(fill = variable, alpha = 0.6)) +
theme_bw() +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
ggplot(data = beers) +
geom_density(aes(x = review_aroma, color = 'coral'), alpha = 0.7, size = 1) +
geom_density(aes(x = review_appearance, color = 'lightblue'), alpha = 0.6, size = 1) +
geom_density(aes(x = review_palate, color = 'lightgreen'), alpha = 0.5, size = 1) +
geom_density(aes(x = review_taste, color = 'maroon'), alpha = 0.4, size = 1) +
geom_density(aes(x = review_overall, color = 'brown'), alpha = 0.3, size = 1) +
scale_color_discrete(name = "Metric", labels = c("Aroma", "Appearance", "Palate", "Taste", "Overall")) +
xlab('Reviews') +
ylab('Density') +
theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(data = beers) +
stat_ecdf(aes(x = review_aroma, color = 'coral')) +
stat_ecdf(aes(x = review_appearance, color = 'lightblue')) +
stat_ecdf(aes(x = review_palate, color = 'lightgreen')) +
stat_ecdf(aes(x = review_taste, color = 'maroon')) +
stat_ecdf(aes(x = review_overall, color = 'brown')) +
scale_color_discrete(name = "Variable", labels = c("Aroma", "Palate", "Taste", "Appearance", "Overall")) +
xlab('Reviews') +
ylab('Density') +
theme_bw()
ggplot(data = beers) +
geom_smooth(aes(x = review_aroma,y= review_overall, color = 'coral'), se = FALSE, alpha = 0.1) +
geom_smooth(aes(x = review_palate,y= review_overall, color = 'red'), se = FALSE, alpha = 0.1) +
geom_smooth(aes(x = review_taste,y= review_overall, color = 'blue'), se = FALSE, alpha = 0.1) +
geom_smooth(aes(x = review_appearance,y= review_overall, color = 'black'), se = FALSE, alpha = 0.1) +
scale_color_discrete(name = "Metric", labels = c("Aroma", "Appearance", "Palate", "Taste")) +
xlab('Other Reviews') +
ylab('Overall Review') +
theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
corr_data = subset(beers, select = c(6:24))
corr <- round(cor(corr_data), 2)
ggcorrplot(corr, type = 'upper', method = 'circle', colors = c("#6495ED", "#FFFFFF", "#FFA07A"))
ggplot(data = beers) +
geom_point(aes(x = log(Body), y = Astringency, color = review_overall), alpha = 0.5) +
facet_grid( ~ cut(Alcohol, breaks = 4)) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
theme_bw()
ggplot(data = beers) +
geom_point(aes(x = log(Sweet), y = Bitter, color = review_overall), alpha = 0.5) +
facet_grid(cut(Sour, breaks = 4) ~ cut(Alcohol, breaks = 4)) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
theme_bw()
ggplot(data = beers) +
geom_point(aes(x = log(Fruits), y = Hoppy, color = review_overall), alpha = 0.5) +
facet_grid(cut(Spices, breaks = 4) ~ cut(Malty, breaks = 4)) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
theme_bw()
beer_lm_base <- lm(formula = review_overall ~ Min.IBU * Max.IBU * Body * Bitter * Sweet * Sour * Fruits * Hoppy * Malty , data = beers)
lm_preds <- augment(beer_lm_base, type.predict = "response")
ggplot(data = lm_preds, aes(x = beers$review_overall, y = .resid)) +
geom_point(aes(color = .fitted), alpha = 0.4) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
labs(title = "Residual Plots for Linear Model with Actual Parameters")+
theme_bw()
pca_data <- subset(corr_data, select = c(1:14))
pca_data
beers_scale = prcomp(pca_data, scale. = TRUE)
beer_loadings = data.frame(beers_scale$rotation)[1:4]
beer_pc = data.frame(beers_scale$x[,0:4], beers$review_overall)
beer_pc
ggplot(data = beer_pc, aes(x = PC1, y = PC2)) +
geom_point(aes(color = beers.review_overall), alpha = 0.4) +
facet_grid(cut(PC3, breaks = 4) ~ cut(PC4, breaks = 4)) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
theme_bw()
beer_lm <- lm(formula = beers.review_overall ~ PC1 * PC2 * PC3 * PC4, data = beer_pc)
beer_lm
##
## Call:
## lm(formula = beers.review_overall ~ PC1 * PC2 * PC3 * PC4, data = beer_pc)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3
## 3.744140 0.078395 -0.045013 0.038374
## PC4 PC1:PC2 PC1:PC3 PC2:PC3
## -0.003715 0.026857 -0.011422 -0.017470
## PC1:PC4 PC2:PC4 PC3:PC4 PC1:PC2:PC3
## 0.007873 0.015187 -0.003995 0.004813
## PC1:PC2:PC4 PC1:PC3:PC4 PC2:PC3:PC4 PC1:PC2:PC3:PC4
## -0.002824 -0.004335 0.006443 -0.003030
beer_loess <- loess(formula = beers.review_overall ~ PC1 * PC2 * PC3 * PC4, data = beer_pc, degree = 2, span = 1, family = "symmetric", normalize = TRUE)
beer_loess
## Call:
## loess(formula = beers.review_overall ~ PC1 * PC2 * PC3 * PC4,
## data = beer_pc, span = 1, degree = 2, normalize = TRUE, family = "symmetric")
##
## Number of Observations: 3197
## Equivalent Number of Parameters: 16.39
## Residual Scale Estimate: 0.327
beer_vals <- data.frame(
expand.grid(
PC1 = runif(10, min = -118.9999, max = 216.5028),
PC2 = runif(10, min = -225.2387, max = 80.41939),
PC3 = runif(10, min = -141.4299, max = 111.0616),
PC4 = runif(10, min = -91.86107, max = 124.6238)
)
)
beer_predlm <- as.data.frame(augment(beer_lm, newdata = beer_vals))
beer_predloess <- as.data.frame(augment(beer_loess, newdata = beer_vals))
lm_preds <- augment(beer_lm, type.predict = "response")
lm_preds
loess_preds <- augment(beer_loess, type.predict = "response")
loess_preds
ggplot(data = lm_preds, aes(x = beers.review_overall, y = .resid)) +
geom_point(aes(color = .fitted), alpha = 0.4) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
theme_bw()
ggplot(data = lm_preds, aes(x = log(beers.review_overall), y = log(.fitted))) +
geom_point(aes(color = .resid), alpha = 0.4) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
theme_bw()
ggplot(data = lm_preds, aes(x = .fitted, y = .resid)) +
geom_point(aes(color = beers.review_overall), alpha = 0.4) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
theme_bw()
ggplot(data = loess_preds, aes(x = beers.review_overall, y = .resid)) +
geom_point(aes(color = .fitted), alpha = 0.4) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
theme_bw()
ggplot(data = loess_preds, aes(x = beers.review_overall, y = .fitted)) +
geom_point(aes(color = .resid), alpha = 0.4) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
theme_bw()
ggplot(data = loess_preds, aes(x = .fitted, y = .resid)) +
geom_point(aes(color = beers.review_overall), alpha = 0.4) +
scale_color_gradientn(colors = c("#F26744", "#ADE330", "#58ED98", "#95ADF5", "#F595CF")) +
theme_bw()
In conclusion, we were able to analyze the questions we put forward in the objectives. Our study did not identify any prominent factors that were significantly more responsible for predicting reviews than others, Both the linear model using PCA components and the model utilizing selected parameters yielded similar fitted and residual plots. However, we observed a linear pattern in the residuals, indicating the presence of heteroscedasticity, which is a concern for linear regression models.
As a limitation, the presence of heteroscedasticity suggests that the variances of the errors are not constant across all levels of the independent variables, potentially affecting the model’s reliability. For future work, we recommend investigating and addressing the heteroscedasticity issue by employing techniques such as data transformations or weighted least squares regression. Additionally, exploring alternative modeling techniques and incorporating other data sources, such as user demographics or beer production methods, may provide a more comprehensive understanding of the factors driving consumer preferences in the beer industry.