TidyTuesday anlyses from David Robinson

Coffee ratings

# libraries
library(tidytuesdayR)
library(tidyverse)

# set plot theme
theme_set(theme_light())

# load data
coffee <- tt_load("2020-07-07")

## 
##  Downloading file 1 of 1: `coffee_ratings.csv`

coffee_ratings <- coffee$coffee_ratings %>%
  mutate(coffee_id = row_number()) %>%
  filter(total_cup_points > 0)

coffee_ratings %>%
  count(species, sort = TRUE) # sort

## # A tibble: 2 x 2
##   species     n
##   <chr>   <int>
## 1 Arabica  1310
## 2 Robusta    28

# focus on variety, filter out any of it is missing
coffee_lumped <- coffee_ratings %>%
  filter(!is.na(variety)) %>%
  mutate(variety = fct_lump(variety, 12), # select the top 12 factors
         sort = TRUE)


coffee_lumped %>%
  mutate(
    variety = fct_reorder(
      variety, total_cup_points
      ) # reorder factors in relation to the total cup points
    ) %>% 
  ggplot(aes(total_cup_points, variety)) +
    geom_boxplot()

coffee_lumped %>%
  ggplot(aes(total_cup_points, fill = variety)) +
    geom_histogram(binwidth = 1) +
    facet_wrap(~ variety, scale = "free_y") + # adapt y axis to each variety
    theme(legend.position = "none")

coffee_ratings %>%
  summarize(
    across(
      everything(), ~ mean(!is.na(.))
      )
    ) %>% 
  gather()

## # A tibble: 44 x 2
##    key               value
##    <chr>             <dbl>
##  1 total_cup_points  1    
##  2 species           1    
##  3 owner             0.995
##  4 country_of_origin 0.999
##  5 farm_name         0.732
##  6 lot_number        0.206
##  7 mill              0.765
##  8 ico_number        0.887
##  9 company           0.844
## 10 altitude          0.831
## # … with 34 more rows

coffee_ratings %>%
  count(producer, sort = TRUE)

## # A tibble: 692 x 2
##    producer                         n
##    <chr>                        <int>
##  1 <NA>                           231
##  2 La Plata                        30
##  3 Ipanema Agrícola SA             22
##  4 Doi Tung Development Project    17
##  5 Ipanema Agricola                12
##  6 VARIOS                          12
##  7 Ipanema Agricola S.A            11
##  8 ROBERTO MONTERROSO              10
##  9 AMILCAR LAPOLA                   9
## 10 LA PLATA                         9
## # … with 682 more rows

coffee_ratings %>%
  count(company, sort = TRUE)

## # A tibble: 282 x 2
##    company                              n
##    <chr>                            <int>
##  1 <NA>                               209
##  2 unex guatemala, s.a.                86
##  3 ipanema coffees                     50
##  4 exportadora de cafe condor s.a      40
##  5 kona pacific farmers cooperative    40
##  6 racafe & cia s.c.a                  40
##  7 blossom valley宸嶧國際              25
##  8 carcafe ltda                        25
##  9 nucoffee                            24
## 10 taiwan coffee laboratory            20
## # … with 272 more rows

coffee_ratings %>%
  count(color, sort = TRUE)

## # A tibble: 5 x 2
##   color            n
##   <chr>        <int>
## 1 Green          869
## 2 <NA>           218
## 3 Bluish-Green   114
## 4 Blue-Green      85
## 5 None            52

coffee_ratings %>%
  count(country = fct_lump(country_of_origin, 12), sort = TRUE) %>%
  filter(!is.na(country)) %>%
  mutate(country = fct_reorder(country, n)) %>%
  ggplot(aes(n, country)) +
  geom_col()

coffee_ratings %>%
  filter(!is.na(country_of_origin)) %>%
  mutate(country = fct_lump(country_of_origin, 12),
         country = fct_reorder(country, total_cup_points)) %>%
  ggplot(aes(total_cup_points, country)) +
  geom_boxplot()

Interesting dimensions:

Country
Variety
Company??

library(ggridges)
coffee_metrics <- coffee_ratings %>%
  select(coffee_id, total_cup_points, variety, company,
         country_of_origin,
         altitude_mean_meters,
         aroma:moisture) %>%
  pivot_longer(aroma:cupper_points, names_to = "metric", values_to = "value")
coffee_metrics %>%
  mutate(metric = fct_reorder(metric, value)) %>%
  ggplot(aes(value, metric)) +
  geom_density_ridges() # geom_density for categorical variables in a dataset

coffee_metrics %>%
  group_by(metric) %>%
  summarize(average = mean(value),
            sd = sd(value)) %>%
  arrange(desc(average))

## # A tibble: 10 x 3
##    metric        average    sd
##    <chr>           <dbl> <dbl>
##  1 sweetness        9.86 0.554
##  2 clean_cup        9.84 0.715
##  3 uniformity       9.84 0.485
##  4 aroma            7.57 0.316
##  5 acidity          7.54 0.319
##  6 flavor           7.53 0.341
##  7 balance          7.52 0.354
##  8 body             7.52 0.308
##  9 cupper_points    7.51 0.427
## 10 aftertaste       7.41 0.350

library(widyr) # finds correlation into a messy dataset
library(ggraph) # show network plot to find out clusters
library(igraph)
library(tidytext)
correlations <- coffee_metrics %>% 
  # do not understand that step
  pairwise_cor(item = metric, feature = coffee_id, value = value, sort = TRUE)


correlations %>%
  head(50) %>%
  # show clusters between correlated variables
  graph_from_data_frame() %>% 
  ggraph() +
    geom_edge_link(aes(edge_alpha = correlation)) +
    geom_node_point() +
    geom_node_text(aes(label = name), repel = TRUE)

# time to apply a pca
coffee_metrics %>%
  filter(!metric %in% c("sweetness", "clean_cup", "uniformity")) %>%
  group_by(metric) %>%
  mutate(centered = value - mean(value)) %>% # center the values
  ungroup() %>%
  widely_svd(item = metric, feature = coffee_id, value = value) %>%
  filter(between(dimension, 2, 5)) %>%
  mutate(metric = reorder_within(metric, value, dimension)) %>%
  ggplot(aes(value, metric)) +
    geom_col() +
    scale_y_reordered() +
    facet_wrap(~ dimension, scales = "free_y")

coffee_ratings %>%
  filter(altitude_mean_meters < 10000,
         altitude != 1) %>%
  mutate(altitude_mean_meters = pmin(altitude_mean_meters, 3000)) %>%
  ggplot(aes(altitude_mean_meters, total_cup_points)) +
    geom_point() +
    geom_smooth(method = "lm")

coffee_metrics %>%
  filter(altitude_mean_meters < 10000) %>%
  mutate(altitude_mean_meters = pmin(altitude_mean_meters, 3000)) %>%
  mutate(km = altitude_mean_meters / 1000) %>%
  group_by(metric) %>%
  summarize(correlation = cor(altitude_mean_meters, value),
            model = list(lm(value ~ km))) %>%
  mutate(tidied = map(model, broom::tidy, conf.int = TRUE)) %>%
  unnest(tidied) %>%
  filter(term == "km") %>%
  ungroup() %>%
  mutate(metric = fct_reorder(metric, estimate)) %>%
  ggplot(aes(estimate, metric, color = p.value < .05)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = .1) +
  labs(y = "Evaluation of coffee",
       x = "Each kilometer of altitude contributes this much to score (95% confidence interval)")

Tour de France

# libraries
library(tidytuesdayR)
library(tidyverse)
library(lubridate)
# set plot theme
theme_set(theme_light())

tuesdata <- tidytuesdayR::tt_load('2020-04-07')

## 
##  Downloading file 1 of 3: `stage_data.csv`
##  Downloading file 2 of 3: `tdf_stages.csv`
##  Downloading file 3 of 3: `tdf_winners.csv`

# look at the columns on the dataset
# let's analyse the birth winners
tdf_winners <- tuesdata$tdf_winners %>% 
  mutate(year = year(start_date), 
         speed = distance / time_overall)

stage_data <- tuesdata$stage_data
tdf_stages <- tuesdata$tdf_stages %>% 
  janitor::clean_names() %>% # put all the titles lower
  mutate(year = year(date))

# exploring winners data
tdf_winners %>% 
  count(birth_country, sort = TRUE) %>% 
  mutate(birth_country = fct_reorder(birth_country, n)) %>% 
  ggplot(aes(y = birth_country, x = n)) + 
    geom_col() + 
    labs(y = "", 
         title = "What the counties are the most Tour the France winners born in?")

tdf_winners %>% 
  count(winner_name, birth_country, sort = TRUE)

## # A tibble: 63 x 3
##    winner_name      birth_country     n
##    <chr>            <chr>         <int>
##  1 Lance Armstrong  USA               7
##  2 Bernard Hinault  France            5
##  3 Eddy Merckx      Belgium           5
##  4 Jacques Anquetil France            5
##  5 Miguel Induráin  Spain             5
##  6 Chris Froome     Kenya             4
##  7 Greg LeMond      USA               3
##  8 Louison Bobet    France            3
##  9 Philippe Thys    Belgium           3
## 10 Alberto Contador Spain             2
## # … with 53 more rows

What has change over time? This question leads to how is the age disitributed over time?

tdf_winners %>% 
  group_by(decade = 10 * (year %/% 10)) %>% # truncated division to get the decade
  summarise(winner_age = mean(age), 
            winner_height = mean(height, na.rm = TRUE), 
            winner_weight = mean(weight, na.rm = TRUE)) %>% 
  ggplot(aes(x = decade, y = winner_age)) + 
    geom_line() + 
    expand_limits(y = 0)

There is no clear relation with age, weight or height over the year in the winners.

Now he is looking at the time margin:

library(lubridate)
by_decade <- tdf_winners %>% 
  group_by(decade = 10 * (year %/% 10)) %>% # truncated division to get the decade
  summarise(winner_age = mean(age), 
            winner_height = mean(height, na.rm = TRUE), 
            winner_weight = mean(weight, na.rm = TRUE), 
            winner_margin = mean(time_margin, na.rm = TRUE), 
            winner_speed = mean(speed, na.rm = TRUE))

by_decade %>% 
  filter(decade >= 1910) %>% # there is only two data points in 1910 
  ggplot(aes(x = decade, y = winner_margin * 60)) + # transform to minutes
    geom_line() + 
    expand_limits(y = 0) + 
    labs(x = "Decade", 
         y = "Average margin of winner (minutes)", 
         title = "Tour the France has been getting closer")

This chart is meaningful.

NOw let’s look at the time of the winner:

by_decade %>%
  ggplot(aes(x = decade, y = winner_speed)) +
    geom_line() + 
    expand_limits(y = 0) + 
    labs(x = "Decade", 
         y = "Average speed of the winner (km/h)", 
         title = "Tour the France winners have been getting faster")

They have started to flatter on the paste year, possible doping?. TO know more abou it, I would be looking for some variable that affects the speed, like elevation. But they are on present on the dataset. So now we know with the winners dataset they are getting faster and there are getting closer (less margin time).

Other things, average life expectancy of the winners. To do that, we will do a survivial analyses:

library(survival) # very good for medical proposals
library(broom)

surv_model <- tdf_winners %>% 
  distinct(winner_name, .keep_all = TRUE) %>% 
  transmute(winner_name,
            birth_year = year(born), 
            death_year = year(died), 
            dead = as.integer(!is.na(death_year))) %>% 
  mutate(age_at_death = coalesce(death_year, 2020) - birth_year) %>% # They are not death, we are just saying they are alive in 2020
  survival::survfit(Surv(age_at_death, dead) ~ 1, data = .) 

glance(surv_model)

## # A tibble: 1 x 10
##   records n.max n.start events rmean rmean.std.error median conf.low conf.high
##     <dbl> <dbl>   <dbl>  <dbl> <dbl>           <dbl>  <dbl>    <dbl>     <dbl>
## 1      63    63      63     38  69.6            2.69     77       71        82
## # … with 1 more variable: nobs <int>

Median life expectancy of a Tour of France winner is 77.

Now let’s start some exploration on the stage data:

# I want to join stage data, but I need to prepare them before doing it 
stages_joined <- stage_data %>% 
  extract(col = stage_results_id,
          into = "stage",
          regex = "stage-(.*)") %>%  # extract anything after "stage"
  inner_join(tdf_stages, by = c("year", "stage")) %>%  # I've lost a little bit of data
  mutate(rank = as.integer(rank)) %>% # NA introduced cause of people not finishing the race
 # add_count(year, stage, name = "competitors") # adding competitors by year and stage
  # now I want to get the number of finishers 
  group_by(year, stage) %>% 
  mutate(finishers = sum(!is.na(rank))) %>% 
  ungroup() %>% 
  mutate(percentile = 1 - rank / finishers)

stages_joined %>% 
  filter(stage == "1") %>% 
  group_by(winner_country) %>% 
  summarise(stages = n(), 
            median_percentile = median(percentile, na.rm = TRUE)) %>% 
  arrange(desc(stages))

## # A tibble: 18 x 3
##    winner_country stages median_percentile
##    <chr>           <int>             <dbl>
##  1 FRA              3215             0.494
##  2 BEL              2760             0.496
##  3 ITA              1637             0.497
##  4 SUI               800             0.5  
##  5 GER               680             0.495
##  6 NED               655             0.498
##  7 GBR               576             0.497
##  8 AUS               387             0.497
##  9 LUX               376             0.496
## 10 EST               368             0.497
## 11 FRG               280             0.496
## 12 CAN               198             0.497
## 13 POR               198             0.497
## 14 SVK               198             0.497
## 15 URS               198             0.497
## 16 USA               189             0.497
## 17 UZB               189             0.497
## 18 ESP               180             0.514

This table tell us that no country does it better than the average at the first stage.

How can I find the relation between the results at the first stage and how is affecting at the final results? Does the winner of the first stage predict their final point ranking?

total_points <- stages_joined %>% 
  group_by(year, rider) %>% 
  summarise(total_points = sum(points, na.rm = TRUE)) %>% 
  mutate(points_rank = percent_rank(total_points)) %>% 
  ungroup()

stages_joined %>% 
  filter(stage == "1") %>% 
  inner_join(total_points, by = c("year", "rider")) %>% 
  select(year, rider, percentile_first_stage = percentile, 
         points_rank) %>% 
  mutate(first_stage_bin = cut(percentile_first_stage, seq(0, 1, .1))) %>% 
  filter(!is.na(first_stage_bin)) %>% 
  ggplot(aes(y = first_stage_bin, x = points_rank)) + 
    geom_boxplot() + 
    labs(y = "Decile performance in the first stage", 
         x = "Overall points percentile")

So yes, there is a relation between your results at the first stage and the final results.

Now we are going to use a gganimate to show a race:

library(gganimate)

top_10_2017 <- total_points %>% 
  filter(year == max(year)) %>% 
  top_n(10, total_points)

stages_joined %>% 
  filter(year == max(year)) %>% 
  semi_join(top_10_2017, by = "rider") %>% # now I have all the data for the top 10 riders accross all the time
  mutate(stage = as.integer(stage), 
         points = coalesce(points, 0)) %>% 
  group_by(rider) %>% 
  mutate(cumulative_points = cumsum(points)) %>% 
  ungroup() %>% 
  ggplot(aes(cumulative_points, rider, fill = cumulative_points)) + 
    geom_col() + 
    transition_time(stage) + 
    theme(legend.position = "none") + 
    labs(title = "The 2018 Tour France: Stage: {frame_time}", 
         x = "Cumulative points at this stage", 
         y = "")

The Office

library(tidyverse)
library(tidytuesdayR)
library(schrute)

# set plot theme
theme_set(theme_light())


office_data <- tt_load("2020-03-17")

## 
##  Downloading file 1 of 1: `office_ratings.csv`

office_ratings <- office_data$office_ratings  %>%
  mutate(name = str_to_lower(str_remove_all(title, "\\.| \\(Part.*|\\: Part.*")))

office_transcripts <- as_tibble(theoffice) %>% 
  mutate(season = as.integer(season),
         episode = as.integer(episode)) %>%
  mutate(character = str_remove_all(character, '"')) %>%
  mutate(name = str_to_lower(str_remove_all(episode_name, "\\.| \\(Part.*")))

library(ggrepel)
office_ratings %>%
  group_by(season) %>%
  summarize(avg_rating = mean(imdb_rating)) %>%
  ggplot(aes(season, avg_rating)) +
    geom_line() +
    scale_x_continuous(breaks = 1:9)

office_ratings %>%
  mutate(title = fct_inorder(title),
         episode_number = row_number()) %>%
  ggplot(aes(episode_number, imdb_rating)) +
    geom_line() +
    geom_smooth() +
    geom_point(aes(color = factor(season), size = total_votes)) +
    geom_text(aes(label = title), check_overlap = TRUE, hjust = 1) +
    expand_limits(x = -10) +
    theme(panel.grid.major.x = element_blank(),
          legend.position = "none") +
    labs(x = "Episode number",
         y = "IMDB Rating",
         title = "Popularity of The Office episodes over time",
         subtitle = "Color represents season, size represents # of ratings")

office_ratings %>%
  arrange(desc(imdb_rating)) %>%
  mutate(title = paste0(season, ".", episode, " ", title),
         title = fct_reorder(title, imdb_rating)) %>%
  head(20) %>%
  ggplot(aes(title, imdb_rating, color = factor(season), size = total_votes)) +
  geom_point() +
  coord_flip() +
  labs(color = "Season",
       title = "Most popular episodes of The Office")

Transcripts

library(tidytext)
blacklist <- c("yeah", "hey", "uh", "gonna")
blacklist_characters <- c("Everyone", "All", "Both", "Guy", "Girl", "Group")
transcript_words <- office_transcripts %>%
  group_by(character) %>%
  filter(n() >= 100,
         n_distinct(episode_name) > 2) %>%
  ungroup() %>%
  select(-text_w_direction) %>%
  unnest_tokens(word, text) %>% 
  anti_join(stop_words, by = "word") %>%
  filter(!word %in% blacklist,
         !character %in% blacklist_characters)

character_tf_idf <- transcript_words %>%
  add_count(word) %>%
  filter(n >= 20) %>%
  count(word, character) %>%
  # which words are more specific to each character
  bind_tf_idf(word, character, n) %>%
  arrange(desc(tf_idf))

character_tf_idf %>%
  filter(character %in% c("Dwight", "Jim", "David Wallace", "Darryl", "Jan", "Holly")) %>%
  group_by(character) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, character)) %>%
  ggplot(aes(word, tf_idf)) +
    geom_col() +
    coord_flip() +
    scale_x_reordered() +
    facet_wrap(~ character, scales = "free_y") +
    labs(x = "",
         y = "TF-IDF of character-word pairs")

office_transcripts %>%
  count(character, sort = TRUE) %>%
  filter(character == "Dwight")

## # A tibble: 1 x 2
##   character     n
##   <chr>     <int>
## 1 Dwight     6847

Machine learning model

What affects popularity of an episode:

Season/time
Director
Writer
Lines per character

ratings_summarized <- office_ratings %>%
  group_by(name) %>%
  summarize(imdb_rating = mean(imdb_rating))
character_lines_ratings <- office_transcripts %>%
  filter(!character %in% blacklist_characters) %>%
  count(character, name) %>%
  group_by(character) %>%
  filter(sum(n) >= 50,
         n() >= 5) %>%
  inner_join(ratings_summarized, by = "name")
character_lines_ratings %>%
  summarize(avg_rating = mean(imdb_rating),
            nb_episodes = n()) %>%
  arrange(desc(avg_rating))

## # A tibble: 39 x 3
##    character     avg_rating nb_episodes
##    <chr>              <dbl>       <int>
##  1 Carol               8.7            6
##  2 Charles             8.62           6
##  3 Karen               8.6           25
##  4 Holly               8.57          16
##  5 Jan                 8.48          37
##  6 Michael             8.42         130
##  7 David Wallace       8.42          16
##  8 David               8.38          27
##  9 Roy                 8.36          28
## 10 Josh                8.34           7
## # … with 29 more rows

director_writer_features <- office_transcripts %>%
  distinct(name, director, writer) %>%
  gather(type, value, director, writer) %>%
  separate_rows(value, sep = ";") %>%
  unite(feature, type, value, sep = ": ") %>%
  group_by(feature) %>%
  filter(n() >= 3) %>%
  mutate(value = 1) %>%
  ungroup()

character_line_features <- character_lines_ratings %>%
  ungroup() %>%
  transmute(name, feature = character, value = log2(n))
season_features = office_ratings %>%
  distinct(name, season) %>%
  transmute(name, feature = paste("season:", season), value = 1)
features <- bind_rows(director_writer_features,
                      character_line_features,
                      season_features) %>%
  semi_join(office_ratings, by = "name") %>%
  semi_join(office_transcripts, by = "name")

episode_feature_matrix <- features %>%
  cast_sparse(name, feature, value)
ratings <- ratings_summarized$imdb_rating[match(rownames(episode_feature_matrix), ratings_summarized$name)]

library(glmnet)
library(broom)
mod <- cv.glmnet(episode_feature_matrix, ratings)
plot(mod)

tidy(mod$glmnet.fit) %>%
  filter(lambda == mod$lambda.min,
         term != "(Intercept)") %>%
  mutate(term = fct_reorder(term, estimate)) %>%
  ggplot(aes(term, estimate, fill = estimate > 0)) +
  geom_col() +
  coord_flip() +
  labs(y = "Estimated effect on the rating of an episode") +
  theme(legend.position = "none")

Wine ratings

library(tidyverse)
library(tidytuesdayR)

theme_set(theme_light())

# load and cleaning step
wine_ratings <- tt_load("2019-05-28")$`winemag-data-130k-v2` %>%
  select(-X1) %>%
  # extract 4 digitis which starts with 20, in a new column called year, 
  extract(title, "year", "(20\\d\\d)", convert = TRUE, remove = FALSE) %>%
  mutate(year = ifelse(year < 1900, NA, year)) %>%
  filter(!is.na(price))

## 
##  Downloading file 1 of 1: `winemag-data-130k-v2.csv`

wine_ratings %>%
  count(country, sort = T)

## # A tibble: 43 x 2
##    country       n
##    <chr>     <int>
##  1 US        54265
##  2 France    17776
##  3 Italy     16914
##  4 Spain      6573
##  5 Portugal   4875
##  6 Chile      4416
##  7 Argentina  3756
##  8 Austria    2799
##  9 Australia  2294
## 10 Germany    2120
## # … with 33 more rows

wine_ratings %>%
  count(designation, sort = T)

## # A tibble: 35,777 x 2
##    designation        n
##    <chr>          <int>
##  1 <NA>           34779
##  2 Reserve         1980
##  3 Estate          1318
##  4 Reserva         1219
##  5 Estate Grown     618
##  6 Riserva          607
##  7 Brut             472
##  8 Dry              406
##  9 Estate Bottled   342
## 10 Crianza          338
## # … with 35,767 more rows

wine_ratings %>%
  count(country, region_1, sort = TRUE)

## # A tibble: 1,247 x 3
##    country   region_1                 n
##    <chr>     <chr>                <int>
##  1 Portugal  <NA>                  4875
##  2 US        Napa Valley           4475
##  3 Chile     <NA>                  4416
##  4 US        Columbia Valley (WA)  4109
##  5 US        Russian River Valley  3090
##  6 Austria   <NA>                  2799
##  7 US        California            2627
##  8 US        Paso Robles           2327
##  9 US        Willamette Valley     2296
## 10 Argentina Mendoza               2275
## # … with 1,237 more rows

wine_ratings %>%
  count(taster_name, sort = TRUE)

## # A tibble: 20 x 2
##    taster_name            n
##    <chr>              <int>
##  1 <NA>               24496
##  2 Roger Voss         20172
##  3 Michael Schachner  14951
##  4 Kerin O’Keefe       9874
##  5 Virginie Boone      9507
##  6 Paul Gregutt        9498
##  7 Matt Kettmann       6237
##  8 Joe Czerwinski      5012
##  9 Sean P. Sullivan    4925
## 10 Anna Lee C. Iijima  4369
## 11 Jim Gordon          4171
## 12 Anne Krebiehl MW    3398
## 13 Lauren Buzzeo       1713
## 14 Susan Kostrzewa     1073
## 15 Mike DeSimone        504
## 16 Jeff Jenssen         491
## 17 Alexander Peartree   413
## 18 Carrie Dykes         138
## 19 Fiona Adams           27
## 20 Christina Pickard      6

wine_ratings %>%
  filter(!is.na(designation)) %>%
  count(variety, designation, sort = TRUE)

## # A tibble: 44,805 x 3
##    variety            designation     n
##    <chr>              <chr>       <int>
##  1 Cabernet Sauvignon Reserve       470
##  2 Chardonnay         Reserve       324
##  3 Pinot Noir         Reserve       279
##  4 Riesling           Dry           231
##  5 Sangiovese         Riserva       229
##  6 Sparkling Blend    Brut          224
##  7 Pinot Noir         Estate        208
##  8 Tempranillo        Crianza       190
##  9 Cabernet Sauvignon Estate        187
## 10 Red Blend          Riserva       182
## # … with 44,795 more rows

wine_ratings %>%
  ggplot(aes(year)) +
  geom_histogram()

wine_ratings %>%
  ggplot(aes(points)) +
  geom_histogram(binwidth = 1)

wine_ratings %>%
  ggplot(aes(price)) +
  geom_histogram() +
  # us a log scale to see if iti is a normal distribution
  scale_x_log10()

ggplot(wine_ratings, aes(price, points)) +
  geom_point(alpha = .1) +
  geom_smooth(method = "lm") +
  scale_x_log10()

# predict the points by the price of a wine
summary(lm(points ~ log2(price), wine_ratings))

## 
## Call:
## lm(formula = points ~ log2(price), data = wine_ratings)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.0559  -1.5136   0.1294   1.7133   9.2408 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 78.981419   0.035765    2208   <2e-16 ***
## log2(price)  1.974162   0.007338     269   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.408 on 120973 degrees of freedom
## Multiple R-squared:  0.3744, Adjusted R-squared:  0.3744 
## F-statistic: 7.239e+04 on 1 and 120973 DF,  p-value: < 2.2e-16

Every time the price doubles, the expected number of points goes up by 2.

library(broom)
model <- wine_ratings %>%
  replace_na(list(taster_name = "Missing", country = "Missing")) %>%
  mutate(country = fct_relevel(fct_lump(country, 7), "US"),
         taster_name = fct_relevel(fct_lump(taster_name, 6), "Missing")) %>%
  lm(points ~ log2(price) + country + year + taster_name, data = .)

model %>%
  tidy(conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(term = str_replace(term, "country", "Country: "),
         term = str_replace(term, "taster_name", "Taster: "),
         term = fct_reorder(term, estimate)) %>%
  ggplot(aes(estimate, term)) +
    geom_point() +
    geom_errorbarh(aes(xmin = conf.low, xmax = conf.high))

# model %>%
#   augment(data = wine_ratings) %>%
#   ggplot(aes(.fitted, points)) +
#     geom_point(alpha = .1)
# tidy(anova(model)) %>%
  # mutate(sumsq / sum(sumsq))

Lasso regression on words in description

library(tidytext)
wine_rating_words <- wine_ratings %>%
  mutate(wine_id = row_number()) %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!word %in% c("wine", "drink"),
         str_detect(word, "[a-z]"))
wine_rating_words %>%
  count(word, sort = TRUE) %>%
  head(20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip()

library(widyr)
wine_words_filtered <- wine_rating_words %>%
  distinct(wine_id, word) %>%
  add_count(word) %>%
  filter(n >= 100)
wine_words_filtered %>%
  pairwise_cor(word, wine_id, sort = TRUE)

## # A tibble: 5,501,370 x 3
##    item1    item2    correlation
##    <chr>    <chr>          <dbl>
##  1 verde    vinho          0.985
##  2 vinho    verde          0.985
##  3 verdot   petit          0.969
##  4 petit    verdot         0.969
##  5 nacional touriga        0.956
##  6 touriga  nacional       0.956
##  7 sirah    petite         0.956
##  8 petite   sirah          0.956
##  9 smith    granny         0.921
## 10 granny   smith          0.921
## # … with 5,501,360 more rows

library(Matrix)
wine_word_matrix <- wine_words_filtered %>%
  cast_sparse(wine_id, word)
wine_ids <- as.integer(rownames(wine_word_matrix))
scores <- wine_ratings$points[wine_ids]
library(glmnet)
wine_word_matrix_extra <- cbind(wine_word_matrix, log_price = log2(wine_ratings$price[wine_ids]))
library(doMC)
registerDoMC(cores = 4)
cv_glmnet_model <- cv.glmnet(wine_word_matrix_extra, scores, parallel = TRUE)
plot(cv_glmnet_model)

lexicon <- cv_glmnet_model$glmnet.fit %>%
  tidy() %>%
  filter(lambda == cv_glmnet_model$lambda.1se,
         term != "(Intercept)",
         term != "log_price") %>%
  select(word = term, coefficient = estimate)
lexicon %>%
  arrange(coefficient) %>%
  group_by(direction = ifelse(coefficient < 0, "Negative", "Positive")) %>%
  top_n(16, abs(coefficient)) %>%
  ungroup() %>%
  mutate(word = fct_reorder(word, coefficient)) %>%
  ggplot(aes(word, coefficient, fill = direction)) +
  geom_col() +
  coord_flip() +
  labs(x = "",
       y = "Estimated effect of the word on the score",
       title = "What words are predictive of a wine's score?")

wine_ratings %>%
  mutate(wine_id = row_number()) %>%
  arrange(points) %>%
  head(1) %>%
  select(wine_id, description) %>%
  pull(description)

## [1] "Aromas of pumpkin, squash and corn chips are stale and not inviting. There's an acceptable mouthfeel to this weird, unbalanced Chardonnay along with flavors of spiced squash, mealy apple and sautéed root vegetables."

wine_rating_words %>%
  filter(wine_id %in% sample(unique(wine_id), 6)) %>%
  distinct(word, title, points) %>%
  mutate(wine = paste0(str_trunc(title, 40), " (", points, ")")) %>%
  inner_join(lexicon, by = "word") %>%
  mutate(word = fct_reorder(word, coefficient)) %>%
  ggplot(aes(word, coefficient, fill = coefficient > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ wine, scales = "free_y") +
  labs(title = "How a lasso regression would predict each wine's score",
       subtitle = "Using a lasso regression with an extra term for price",
       x = "",
       y = "Effect on score")

What is glmnet?

cv_glmnet_model$glmnet.fit %>%
  tidy() %>%
  filter(term %in% c("rich", "black", "simple", "complex", "vineyard", "concentrated")) %>%
  ggplot(aes(lambda, estimate, color = term)) +
  geom_line() +
  scale_x_log10() +
  geom_hline(lty = 2, yintercept = 0)

cv_glmnet_model$glmnet.fit %>%
  tidy() %>%
  count(lambda) %>%
  ggplot(aes(lambda, n)) +
  geom_line() +
  scale_x_log10()

wine_ratings %>%
  mutate(country = fct_relevel(fct_lump(country, 7), "US")) %>%
  mutate(country = fct_reorder(country, points)) %>%
  ggplot(aes(country, points)) +
  geom_boxplot() +
  coord_flip()

wine_ratings %>%
  group_by(year) %>%
  summarize(average_points = mean(points), n())

## # A tibble: 19 x 3
##     year average_points `n()`
##    <int>          <dbl> <int>
##  1  2000           87.3   735
##  2  2001           87.4   668
##  3  2002           87.6   333
##  4  2003           88.4   499
##  5  2004           88.7  1604
##  6  2005           88.2  3293
##  7  2006           88.1  5170
##  8  2007           88.1  6498
##  9  2008           88.1  6725
## 10  2009           88.3  9056
## 11  2010           88.2 11105
## 12  2011           88.2 11437
## 13  2012           88.8 14742
## 14  2013           89.0 15196
## 15  2014           88.9 14892
## 16  2015           88.5  9642
## 17  2016           87.7  3549
## 18  2017           85.5    11
## 19    NA           87.6  5820

wine_ratings %>%
  mutate(reviewer = fct_reorder(fct_lump(taster_name, 10), points)) %>%
  ggplot(aes(reviewer, points)) +
  geom_boxplot() +
  coord_flip()