Priyanka Gagneja
01/17/2020
Reviews are measured on an 11-point (Likert) scale ranging from 0 to 5 in 0.5-point steps, with a focus on aspects such as:
5840 Breweries
66K Beers
104 Beer Styles
# High-level overview of the dataset: dimensions, discrete vs. continuous
# column counts, missing values and memory usage.
library(DataExplorer)
introduce(beer)
rows columns discrete_columns continuous_columns all_missing_columns
1 1586614 13 4 9 0
total_missing_values complete_rows total_observations memory_usage
1 67785 1518829 20625982 128171432
Note: Missing data does not seem to be an issue for this dataset. All fields are complete except for the
beer_abv field, which has 67,785 NAs (4.27%).
# Count and percentage of missing values per column.
library(inspectdf)
inspect_na(beer)
# A tibble: 13 x 3
col_name cnt pcnt
<chr> <int> <dbl>
1 beer_abv 67785 4.27
2 brewery_id 0 0
3 brewery_name 0 0
4 review_time 0 0
5 review_overall 0 0
6 review_aroma 0 0
7 review_appearance 0 0
8 review_profilename 0 0
9 beer_style 0 0
10 review_palate 0 0
11 review_taste 0 0
12 beer_name 0 0
13 beer_beerid 0 0
# Number and share of columns per data type.
inspect_types(beer)
# A tibble: 3 x 4
type cnt pcnt col_name
<chr> <int> <dbl> <list>
1 numeric 6 46.2 <chr [6]>
2 factor 4 30.8 <chr [4]>
3 integer 3 23.1 <chr [3]>
# Summary statistics (min, quartiles, mean, sd, % missing) for numeric columns.
inspect_num(beer)
# A tibble: 9 x 10
col_name min q1 median mean q3 max sd pcnt_na hist
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <nam>
1 brewery… 1.00e+0 1.43e2 4.29e2 3.13e3 2.37e3 2.80e4 5.58e+3 0 <tib…
2 review_… 8.41e+8 1.17e9 1.24e9 1.22e9 1.29e9 1.33e9 7.65e+7 0 <tib…
3 review_… 0. 3.50e0 4.00e0 3.82e0 4.50e0 5.00e0 7.21e-1 0 <tib…
4 review_… 1.00e+0 3.50e0 4.00e0 3.74e0 4.00e0 5.00e0 6.98e-1 0 <tib…
5 review_… 0. 3.50e0 4.00e0 3.84e0 4.00e0 5.00e0 6.16e-1 0 <tib…
6 review_… 1.00e+0 3.50e0 4.00e0 3.74e0 4.00e0 5.00e0 6.82e-1 0 <tib…
7 review_… 1.00e+0 3.50e0 4.00e0 3.79e0 4.50e0 5.00e0 7.32e-1 0 <tib…
8 beer_abv 1.00e-2 5.20e0 6.50e0 7.04e0 8.50e0 5.77e1 2.32e+0 4.27 <tib…
9 beer_be… 3.00e+0 1.72e3 1.39e4 2.17e4 3.94e4 7.73e4 2.18e+4 0 <tib…
# Plot the numeric-column summary.
show_plot(inspect_num(beer))
#show_plot(inspect_cat(beer))
# Categorical columns: number of distinct levels and the most common level.
inspect_cat(beer)
# A tibble: 4 x 5
col_name cnt common common_pcnt levels
<chr> <int> <chr> <dbl> <named list>
1 beer_name 56857 90 Minute IPA 0.207 <tibble [56,85…
2 beer_style 104 American IPA 7.41 <tibble [104 ×…
3 brewery_name 5743 Boston Beer Company (S… 2.49 <tibble [5,743…
4 review_profile… 33388 northyorksammy 0.367 <tibble [33,38…
Note: _name & _id variables need some cleaning up.
# Basic summary statistics for each column
summary(beer)
# More detailed per-column description via describe() (from Hmisc)
# NOTE(review): Hmisc also exports summarize(); if it is attached after dplyr
# it masks dplyr::summarize() (summarise() is unaffected) - confirm load order.
library(Hmisc)
describe(beer)
Context:
Challenge:
How do beer_abv values vary?
Individual values
Brewery with the beer that has the highest individual beer_abv
# Review row(s) with the single highest beer_abv, shown with the brewery.
# `wt = beer_abv` is passed explicitly: a bare top_n(1) ranks by whichever
# column happens to be last and prints a "Selecting by ..." message.
# If several rows tie for the maximum, all of them are returned.
beer %>%
  arrange(desc(beer_abv)) %>%
  select(brewery_name, beer_abv) %>%
  top_n(1, beer_abv)
brewery_name beer_abv
1 Schorschbräu 57.7
Brewery with the highest average beer_abv
# Average beer_abv per brewery, keeping the brewery with the highest average.
# The mean is taken over review rows, so heavily reviewed beers weigh more.
# na.rm = TRUE: beer_abv has ~4% NAs; without it any brewery with a single
# missing ABV gets an NA average and silently drops out of the ranking.
# Re-run to refresh any previously printed result.
beer %>%
  select(brewery_name, beer_abv) %>%
  group_by(brewery_name) %>%
  summarise(avg_abv = mean(beer_abv, na.rm = TRUE)) %>%
  arrange(desc(avg_abv)) %>%
  top_n(1, avg_abv)
# A tibble: 1 x 2
brewery_name avg_abv
<fct> <dbl>
1 Schorschbräu 19.2
# Distinct (beer_name, review_overall) pairs, highest ratings first;
# show the first six.
beer %>%
  select(beer_name, review_overall) %>%
  arrange(desc(review_overall)) %>%
  unique() %>%
  head(n = 6)
beer_name review_overall
1 Caldera Ginger Beer 5
2 Rauch Ür Bock 5
5 Caldera Pale Ale 5
14 Pilot Rock Porter 5
15 Vas Deferens Ale 5
16 Old Growth Imperial Stout 5
# For each beer, count reviews per review_overall value and keep that beer's
# most frequent rating; then list the first six rows, ordered by rating
# (high to low) and by count within a rating.
# After summarise() the data is still grouped by beer_name, so top_n() works
# per beer. `wt = count` is explicit rather than relying on top_n()'s default
# of ranking by the last column.
beer %>%
  select(beer_name, review_overall) %>%
  group_by(beer_name, review_overall) %>%
  summarise(count = n()) %>%
  arrange(desc(review_overall), desc(count)) %>%
  top_n(1, count) %>%
  head()
# A tibble: 6 x 3
# Groups: beer_name [6]
beer_name review_overall count
<fct> <dbl> <int>
1 Pliny The Elder 5 1067
2 Trappist Westvleteren 12 5 594
3 Pliny The Younger 5 293
4 Portsmouth Kate The Great 5 187
5 Live Oak HefeWeizen 5 137
6 Citra DIPA 5 111
Let's find out the relationships/correlations among the various review-related variables
# Correlation matrix of the five review scores, drawn as numbers in the
# upper triangle only.
library(corrplot)
beer_mat <- as.matrix(
  beer[, c(
    "review_overall", "review_aroma", "review_palate",
    "review_appearance", "review_taste"
  )]
)
corrplot(cor(beer_mat), method = "number", type = "upper")
# Count reviews per (beer, aroma, appearance, overall) combination, then for
# each (beer, aroma, appearance) group keep the overall score given most
# often. Rows are ordered by aroma, appearance, overall and count, all
# descending, and the first six are shown.
# summarise() drops only the last grouping level, so top_n() runs per
# (beer_name, review_aroma, review_appearance). `wt = count` is explicit
# instead of relying on top_n()'s default of ranking by the last column.
beer %>%
  select(beer_name, review_aroma, review_appearance, review_overall) %>%
  group_by(beer_name, review_aroma, review_appearance, review_overall) %>%
  summarise(count = n()) %>%
  arrange(
    desc(review_aroma),
    desc(review_appearance),
    desc(review_overall),
    desc(count)
  ) %>%
  top_n(1, count) %>%
  head()
# A tibble: 6 x 5
# Groups: beer_name, review_aroma, review_appearance [6]
beer_name review_aroma review_appearan… review_overall count
<fct> <dbl> <dbl> <dbl> <int>
1 Pliny The Elder 5 5 5 214
2 Trappist Westvleteren… 5 5 5 159
3 Founders KBS (Kentuck… 5 5 5 148
4 The Abyss 5 5 5 126
5 Pliny The Younger 5 5 5 109
6 Founders Breakfast St… 5 5 5 105
Questions