# libraries
library(tidytuesdayR)
library(tidyverse)
# set plot theme
theme_set(theme_light())
# load data
coffee <- tt_load("2020-07-07")
##
## Downloading file 1 of 1: `coffee_ratings.csv`
coffee_ratings <- coffee$coffee_ratings %>%
mutate(coffee_id = row_number()) %>%
filter(total_cup_points > 0)
coffee_ratings %>%
count(species, sort = TRUE) # most common species first
## # A tibble: 2 x 2
## species n
## <chr> <int>
## 1 Arabica 1310
## 2 Robusta 28
# focus on variety; drop rows where it is missing
coffee_lumped <- coffee_ratings %>%
filter(!is.na(variety)) %>%
mutate(variety = fct_lump(variety, 12)) # lump all but the top 12 varieties into "Other"
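As a quick illustration of how fct_lump() behaves (a toy factor with made-up values):
# keep the 2 most common levels, lump the rest into "Other"
fct_lump(factor(c("a", "a", "a", "b", "b", "c", "d")), 2)
## [1] a     a     a     b     b     Other Other
## Levels: a b Other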
coffee_lumped %>%
mutate(
variety = fct_reorder(
variety, total_cup_points
) # reorder factors in relation to the total cup points
) %>%
ggplot(aes(total_cup_points, variety)) +
geom_boxplot()
coffee_lumped %>%
ggplot(aes(total_cup_points, fill = variety)) +
geom_histogram(binwidth = 1) +
facet_wrap(~ variety, scales = "free_y") + # free y-axis so each variety's distribution is visible
theme(legend.position = "none")
coffee_ratings %>%
summarize(
across(
everything(), ~ mean(!is.na(.)) # share of non-missing values per column
)
) %>%
gather()
## # A tibble: 44 x 2
## key value
## <chr> <dbl>
## 1 total_cup_points 1
## 2 species 1
## 3 owner 0.995
## 4 country_of_origin 0.999
## 5 farm_name 0.732
## 6 lot_number 0.206
## 7 mill 0.765
## 8 ico_number 0.887
## 9 company 0.844
## 10 altitude 0.831
## # … with 34 more rows
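The gather() call above still works, but it has been superseded; the equivalent reshape with pivot_longer() (same output, newer API):
coffee_ratings %>%
  summarize(across(everything(), ~ mean(!is.na(.)))) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value")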
coffee_ratings %>%
count(producer, sort = TRUE)
## # A tibble: 692 x 2
## producer n
## <chr> <int>
## 1 <NA> 231
## 2 La Plata 30
## 3 Ipanema Agrícola SA 22
## 4 Doi Tung Development Project 17
## 5 Ipanema Agricola 12
## 6 VARIOS 12
## 7 Ipanema Agricola S.A 11
## 8 ROBERTO MONTERROSO 10
## 9 AMILCAR LAPOLA 9
## 10 LA PLATA 9
## # … with 682 more rows
coffee_ratings %>%
count(company, sort = TRUE)
## # A tibble: 282 x 2
## company n
## <chr> <int>
## 1 <NA> 209
## 2 unex guatemala, s.a. 86
## 3 ipanema coffees 50
## 4 exportadora de cafe condor s.a 40
## 5 kona pacific farmers cooperative 40
## 6 racafe & cia s.c.a 40
## 7 blossom valley宸嶧國際 25
## 8 carcafe ltda 25
## 9 nucoffee 24
## 10 taiwan coffee laboratory 20
## # … with 272 more rows
coffee_ratings %>%
count(color, sort = TRUE)
## # A tibble: 5 x 2
## color n
## <chr> <int>
## 1 Green 869
## 2 <NA> 218
## 3 Bluish-Green 114
## 4 Blue-Green 85
## 5 None 52
coffee_ratings %>%
count(country = fct_lump(country_of_origin, 12), sort = TRUE) %>%
filter(!is.na(country)) %>%
mutate(country = fct_reorder(country, n)) %>%
ggplot(aes(n, country)) +
geom_col()
coffee_ratings %>%
filter(!is.na(country_of_origin)) %>%
mutate(country = fct_lump(country_of_origin, 12),
country = fct_reorder(country, total_cup_points)) %>%
ggplot(aes(total_cup_points, country)) +
geom_boxplot()
Now let's explore the individual rating dimensions:
library(ggridges)
coffee_metrics <- coffee_ratings %>%
select(coffee_id, total_cup_points, variety, company,
country_of_origin,
altitude_mean_meters,
aroma:moisture) %>%
pivot_longer(aroma:cupper_points, names_to = "metric", values_to = "value")
coffee_metrics %>%
mutate(metric = fct_reorder(metric, value)) %>%
ggplot(aes(value, metric)) +
geom_density_ridges() # one density curve per metric, stacked on a shared axis
coffee_metrics %>%
group_by(metric) %>%
summarize(average = mean(value),
sd = sd(value)) %>%
arrange(desc(average))
## # A tibble: 10 x 3
## metric average sd
## <chr> <dbl> <dbl>
## 1 sweetness 9.86 0.554
## 2 clean_cup 9.84 0.715
## 3 uniformity 9.84 0.485
## 4 aroma 7.57 0.316
## 5 acidity 7.54 0.319
## 6 flavor 7.53 0.341
## 7 balance 7.52 0.354
## 8 body 7.52 0.308
## 9 cupper_points 7.51 0.427
## 10 aftertaste 7.41 0.350
library(widyr) # pairwise operations (e.g. correlations) on tidy data
library(ggraph) # network plots, to spot clusters of correlated metrics
library(igraph)
library(tidytext)
correlations <- coffee_metrics %>%
# correlate every pair of metrics, using coffee_id to identify the observations
pairwise_cor(item = metric, feature = coffee_id, value = value, sort = TRUE)
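For intuition, here is pairwise_cor() on a tiny hand-made dataset (hypothetical values, just to show the shape of the output):
# two perfectly correlated metrics and one anti-correlated metric
toy <- tibble(
  id = rep(1:4, each = 3),
  metric = rep(c("a", "b", "c"), times = 4),
  value = c(1, 1, 4, 2, 2, 3, 3, 3, 2, 4, 4, 1)
)
toy %>%
  pairwise_cor(item = metric, feature = id, value = value)
# a and b correlate at 1; each correlates with c at -1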
correlations %>%
head(50) %>%
# show clusters between correlated variables
graph_from_data_frame() %>%
ggraph() +
geom_edge_link(aes(edge_alpha = correlation)) +
geom_node_point() +
geom_node_text(aes(label = name), repel = TRUE)
# time to apply a PCA (via SVD on the centered values)
coffee_metrics %>%
filter(!metric %in% c("sweetness", "clean_cup", "uniformity")) %>%
group_by(metric) %>%
mutate(centered = value - mean(value)) %>% # center each metric around its mean
ungroup() %>%
widely_svd(item = metric, feature = coffee_id, value = centered) %>%
filter(between(dimension, 2, 5)) %>%
mutate(metric = reorder_within(metric, value, dimension)) %>%
ggplot(aes(value, metric)) +
geom_col() +
scale_y_reordered() +
facet_wrap(~ dimension, scales = "free_y")
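For intuition, widely_svd() is roughly equivalent to casting the data to a wide matrix and running base svd(); a minimal sketch (assuming complete cases; the object names here are my own):
# rough equivalent of widely_svd(): wide matrix + base svd()
wide <- coffee_metrics %>%
  filter(!metric %in% c("sweetness", "clean_cup", "uniformity")) %>%
  pivot_wider(id_cols = coffee_id, names_from = metric, values_from = value) %>%
  select(-coffee_id) %>%
  drop_na() %>%
  as.matrix()
decomposition <- svd(scale(wide, center = TRUE, scale = FALSE))
decomposition$v[, 1:2] # loadings of each metric on the first two dimensions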
coffee_ratings %>%
filter(altitude_mean_meters < 10000, # drop implausible altitudes
altitude != 1) %>%
mutate(altitude_mean_meters = pmin(altitude_mean_meters, 3000)) %>% # cap at 3,000 m
ggplot(aes(altitude_mean_meters, total_cup_points)) +
geom_point() +
geom_smooth(method = "lm")
coffee_metrics %>%
filter(altitude_mean_meters < 10000) %>%
mutate(altitude_mean_meters = pmin(altitude_mean_meters, 3000)) %>%
mutate(km = altitude_mean_meters / 1000) %>%
group_by(metric) %>%
summarize(correlation = cor(altitude_mean_meters, value),
model = list(lm(value ~ km))) %>%
mutate(tidied = map(model, broom::tidy, conf.int = TRUE)) %>%
unnest(tidied) %>%
filter(term == "km") %>%
ungroup() %>%
mutate(metric = fct_reorder(metric, estimate)) %>%
ggplot(aes(estimate, metric, color = p.value < .05)) +
geom_point() +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = .1) +
labs(y = "Evaluation of coffee",
x = "Each kilometer of altitude contributes this much to score (95% confidence interval)")
# libraries
library(tidytuesdayR)
library(tidyverse)
library(lubridate)
# set plot theme
theme_set(theme_light())
tuesdata <- tidytuesdayR::tt_load('2020-04-07')
##
## Downloading file 1 of 3: `stage_data.csv`
## Downloading file 2 of 3: `tdf_stages.csv`
## Downloading file 3 of 3: `tdf_winners.csv`
# look at the columns in the dataset
# let's analyse where the winners were born
tdf_winners <- tuesdata$tdf_winners %>%
mutate(year = year(start_date),
speed = distance / time_overall)
stage_data <- tuesdata$stage_data
tdf_stages <- tuesdata$tdf_stages %>%
janitor::clean_names() %>% # standardize column names to snake_case
mutate(year = year(date))
# exploring winners data
tdf_winners %>%
count(birth_country, sort = TRUE) %>%
mutate(birth_country = fct_reorder(birth_country, n)) %>%
ggplot(aes(y = birth_country, x = n)) +
geom_col() +
labs(y = "",
title = "What the counties are the most Tour the France winners born in?")
tdf_winners %>%
count(winner_name, birth_country, sort = TRUE)
## # A tibble: 63 x 3
## winner_name birth_country n
## <chr> <chr> <int>
## 1 Lance Armstrong USA 7
## 2 Bernard Hinault France 5
## 3 Eddy Merckx Belgium 5
## 4 Jacques Anquetil France 5
## 5 Miguel Induráin Spain 5
## 6 Chris Froome Kenya 4
## 7 Greg LeMond USA 3
## 8 Louison Bobet France 3
## 9 Philippe Thys Belgium 3
## 10 Alberto Contador Spain 2
## # … with 53 more rows
What has changed over time? Let's start with how the winners' age is distributed across the decades.
tdf_winners %>%
group_by(decade = 10 * (year %/% 10)) %>% # truncated division to get the decade
summarise(winner_age = mean(age),
winner_height = mean(height, na.rm = TRUE),
winner_weight = mean(weight, na.rm = TRUE)) %>%
ggplot(aes(x = decade, y = winner_age)) +
geom_line() +
expand_limits(y = 0)
There is no clear trend in the winners' age, weight, or height over the years.
Now let's look at the winning time margin:
by_decade <- tdf_winners %>%
group_by(decade = 10 * (year %/% 10)) %>% # truncated division to get the decade
summarise(winner_age = mean(age),
winner_height = mean(height, na.rm = TRUE),
winner_weight = mean(weight, na.rm = TRUE),
winner_margin = mean(time_margin, na.rm = TRUE),
winner_speed = mean(speed, na.rm = TRUE))
by_decade %>%
filter(decade >= 1910) %>% # the 1900s decade has only two data points
ggplot(aes(x = decade, y = winner_margin * 60)) + # convert hours to minutes
geom_line() +
expand_limits(y = 0) +
labs(x = "Decade",
y = "Average margin of winner (minutes)",
title = "Tour the France has been getting closer")
The winning margin has shrunk dramatically over the decades.
Now let's look at the winners' speed:
by_decade %>%
ggplot(aes(x = decade, y = winner_speed)) +
geom_line() +
expand_limits(y = 0) +
labs(x = "Decade",
y = "Average speed of the winner (km/h)",
title = "Tour the France winners have been getting faster")
Speeds appear to have flattened out in recent years (possibly related to doping controls?). To dig deeper, I would look for variables that affect speed, such as elevation, but those are not present in the dataset. So from the winners dataset we know the winners have been getting faster and the races have been getting closer (smaller time margins).
Another question: what is the average life expectancy of the winners? To answer it, we will run a survival analysis:
library(survival) # widely used for medical / time-to-event analysis
library(broom)
surv_model <- tdf_winners %>%
distinct(winner_name, .keep_all = TRUE) %>%
transmute(winner_name,
birth_year = year(born),
death_year = year(died),
dead = as.integer(!is.na(death_year))) %>%
mutate(age_at_death = coalesce(death_year, 2020) - birth_year) %>% # living riders are censored: treat them as alive in 2020
survival::survfit(Surv(age_at_death, dead) ~ 1, data = .)
glance(surv_model)
## # A tibble: 1 x 10
## records n.max n.start events rmean rmean.std.error median conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 63 63 63 38 69.6 2.69 77 71 82
## # … with 1 more variable: nobs <int>
The median life expectancy of a Tour de France winner is 77.
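To see the full Kaplan-Meier curve rather than just the summary statistics, a minimal sketch using broom::tidy() on the fitted model:
# plot the Kaplan-Meier survival curve with its confidence band
tidy(surv_model) %>%
  ggplot(aes(time, estimate)) +
  geom_line() +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
  labs(x = "Age", y = "Estimated probability of being alive")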
Now let’s start some exploration on the stage data:
# I want to join the two stage datasets, but I need to prepare them first
stages_joined <- stage_data %>%
extract(col = stage_results_id,
into = "stage",
regex = "stage-(.*)") %>% # extract everything after "stage-"
inner_join(tdf_stages, by = c("year", "stage")) %>% # the join drops a little bit of data
mutate(rank = as.integer(rank)) %>% # NAs introduced for riders who did not finish
# add_count(year, stage, name = "competitors") # competitors per year and stage
# now I want the number of finishers per stage
group_by(year, stage) %>%
mutate(finishers = sum(!is.na(rank))) %>%
ungroup() %>%
mutate(percentile = 1 - rank / finishers) # near 1 = front of the stage, 0 = last finisher
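A quick sanity check on that percentile definition, using made-up ranks for a hypothetical 150-finisher stage:
tibble(rank = c(1, 75, 150), finishers = 150) %>%
  mutate(percentile = 1 - rank / finishers)
## percentiles: 0.993, 0.5, 0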
stages_joined %>%
filter(stage == "1") %>%
group_by(winner_country) %>%
summarise(stages = n(),
median_percentile = median(percentile, na.rm = TRUE)) %>%
arrange(desc(stages))
## # A tibble: 18 x 3
## winner_country stages median_percentile
## <chr> <int> <dbl>
## 1 FRA 3215 0.494
## 2 BEL 2760 0.496
## 3 ITA 1637 0.497
## 4 SUI 800 0.5
## 5 GER 680 0.495
## 6 NED 655 0.498
## 7 GBR 576 0.497
## 8 AUS 387 0.497
## 9 LUX 376 0.496
## 10 EST 368 0.497
## 11 FRG 280 0.496
## 12 CAN 198 0.497
## 13 POR 198 0.497
## 14 SVK 198 0.497
## 15 URS 198 0.497
## 16 USA 189 0.497
## 17 UZB 189 0.497
## 18 ESP 180 0.514
This table tells us that no country does systematically better than average in the first stage.
Does a rider's performance in the first stage predict their final points ranking?
total_points <- stages_joined %>%
group_by(year, rider) %>%
summarise(total_points = sum(points, na.rm = TRUE)) %>%
# summarise() drops the last grouping level, so the rank is computed within each year
mutate(points_rank = percent_rank(total_points)) %>%
ungroup()
stages_joined %>%
filter(stage == "1") %>%
inner_join(total_points, by = c("year", "rider")) %>%
select(year, rider, percentile_first_stage = percentile,
points_rank) %>%
mutate(first_stage_bin = cut(percentile_first_stage, seq(0, 1, .1))) %>%
filter(!is.na(first_stage_bin)) %>%
ggplot(aes(y = first_stage_bin, x = points_rank)) +
geom_boxplot() +
labs(y = "Decile performance in the first stage",
x = "Overall points percentile")
So yes, there is a relationship between first-stage results and final results.
Now let's use gganimate to animate the race:
library(gganimate)
top_10_2017 <- total_points %>%
filter(year == max(year)) %>%
top_n(10, total_points)
stages_joined %>%
filter(year == max(year)) %>%
semi_join(top_10_2017, by = "rider") %>% # keep every stage for that year's top 10 riders
mutate(stage = as.integer(stage),
points = coalesce(points, 0)) %>%
group_by(rider) %>%
mutate(cumulative_points = cumsum(points)) %>%
ungroup() %>%
ggplot(aes(cumulative_points, rider, fill = cumulative_points)) +
geom_col() +
transition_time(stage) +
theme(legend.position = "none") +
labs(title = "The 2018 Tour France: Stage: {frame_time}",
x = "Cumulative points at this stage",
y = "")
library(tidyverse)
library(tidytuesdayR)
library(schrute)
# set plot theme
theme_set(theme_light())
office_data <- tt_load("2020-03-17")
##
## Downloading file 1 of 1: `office_ratings.csv`
office_ratings <- office_data$office_ratings %>%
mutate(name = str_to_lower(str_remove_all(title, "\\.| \\(Part.*|\\: Part.*")))
office_transcripts <- as_tibble(theoffice) %>%
mutate(season = as.integer(season),
episode = as.integer(episode)) %>%
mutate(character = str_remove_all(character, '"')) %>%
mutate(name = str_to_lower(str_remove_all(episode_name, "\\.| \\(Part.*")))
library(ggrepel)
office_ratings %>%
group_by(season) %>%
summarize(avg_rating = mean(imdb_rating)) %>%
ggplot(aes(season, avg_rating)) +
geom_line() +
scale_x_continuous(breaks = 1:9)
office_ratings %>%
mutate(title = fct_inorder(title),
episode_number = row_number()) %>%
ggplot(aes(episode_number, imdb_rating)) +
geom_line() +
geom_smooth() +
geom_point(aes(color = factor(season), size = total_votes)) +
geom_text(aes(label = title), check_overlap = TRUE, hjust = 1) +
expand_limits(x = -10) +
theme(panel.grid.major.x = element_blank(),
legend.position = "none") +
labs(x = "Episode number",
y = "IMDB Rating",
title = "Popularity of The Office episodes over time",
subtitle = "Color represents season, size represents # of ratings")
office_ratings %>%
arrange(desc(imdb_rating)) %>%
mutate(title = paste0(season, ".", episode, " ", title),
title = fct_reorder(title, imdb_rating)) %>%
head(20) %>%
ggplot(aes(title, imdb_rating, color = factor(season), size = total_votes)) +
geom_point() +
coord_flip() +
labs(color = "Season",
title = "Most popular episodes of The Office")
library(tidytext)
blacklist <- c("yeah", "hey", "uh", "gonna")
blacklist_characters <- c("Everyone", "All", "Both", "Guy", "Girl", "Group")
transcript_words <- office_transcripts %>%
group_by(character) %>%
filter(n() >= 100,
n_distinct(episode_name) > 2) %>%
ungroup() %>%
select(-text_w_direction) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words, by = "word") %>%
filter(!word %in% blacklist,
!character %in% blacklist_characters)
character_tf_idf <- transcript_words %>%
add_count(word) %>%
filter(n >= 20) %>%
count(word, character) %>%
# which words are more specific to each character
bind_tf_idf(word, character, n) %>%
arrange(desc(tf_idf))
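For intuition, bind_tf_idf() computes tf = (count of the word in a character's lines) / (total words for that character) and idf = ln(number of characters / number of characters using the word). A hand-rolled sketch for comparison (it skips the n >= 20 filter above, so values will differ slightly):
# hand-rolled tf-idf on the raw word counts
manual_tf_idf <- transcript_words %>%
  count(character, word) %>%
  group_by(character) %>%
  mutate(tf = n / sum(n)) %>% # term frequency within each character's lines
  ungroup() %>%
  add_count(word, name = "n_characters") %>% # characters who use the word
  mutate(idf = log(n_distinct(character) / n_characters),
         tf_idf = tf * idf)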
character_tf_idf %>%
filter(character %in% c("Dwight", "Jim", "David Wallace", "Darryl", "Jan", "Holly")) %>%
group_by(character) %>%
top_n(10, tf_idf) %>%
ungroup() %>%
mutate(word = reorder_within(word, tf_idf, character)) %>%
ggplot(aes(word, tf_idf)) +
geom_col() +
coord_flip() +
scale_x_reordered() +
facet_wrap(~ character, scales = "free_y") +
labs(x = "",
y = "TF-IDF of character-word pairs")
office_transcripts %>%
count(character, sort = TRUE) %>%
filter(character == "Dwight")
## # A tibble: 1 x 2
## character n
## <chr> <int>
## 1 Dwight 6847
What affects the popularity of an episode?
ratings_summarized <- office_ratings %>%
group_by(name) %>%
summarize(imdb_rating = mean(imdb_rating))
character_lines_ratings <- office_transcripts %>%
filter(!character %in% blacklist_characters) %>%
count(character, name) %>%
group_by(character) %>%
filter(sum(n) >= 50,
n() >= 5) %>%
inner_join(ratings_summarized, by = "name")
character_lines_ratings %>% # still grouped by character
summarize(avg_rating = mean(imdb_rating),
nb_episodes = n()) %>%
arrange(desc(avg_rating))
## # A tibble: 39 x 3
## character avg_rating nb_episodes
## <chr> <dbl> <int>
## 1 Carol 8.7 6
## 2 Charles 8.62 6
## 3 Karen 8.6 25
## 4 Holly 8.57 16
## 5 Jan 8.48 37
## 6 Michael 8.42 130
## 7 David Wallace 8.42 16
## 8 David 8.38 27
## 9 Roy 8.36 28
## 10 Josh 8.34 7
## # … with 29 more rows
director_writer_features <- office_transcripts %>%
distinct(name, director, writer) %>%
gather(type, value, director, writer) %>%
separate_rows(value, sep = ";") %>%
unite(feature, type, value, sep = ": ") %>%
group_by(feature) %>%
filter(n() >= 3) %>%
mutate(value = 1) %>%
ungroup()
character_line_features <- character_lines_ratings %>%
ungroup() %>%
transmute(name, feature = character, value = log2(n))
season_features <- office_ratings %>%
distinct(name, season) %>%
transmute(name, feature = paste("season:", season), value = 1)
features <- bind_rows(director_writer_features,
character_line_features,
season_features) %>%
semi_join(office_ratings, by = "name") %>%
semi_join(office_transcripts, by = "name")
episode_feature_matrix <- features %>%
cast_sparse(name, feature, value)
# align the response with the matrix: one rating per episode, in rowname order
ratings <- ratings_summarized$imdb_rating[match(rownames(episode_feature_matrix), ratings_summarized$name)]
library(glmnet)
library(broom)
mod <- cv.glmnet(episode_feature_matrix, ratings)
plot(mod)
tidy(mod$glmnet.fit) %>%
filter(lambda == mod$lambda.min,
term != "(Intercept)") %>%
mutate(term = fct_reorder(term, estimate)) %>%
ggplot(aes(term, estimate, fill = estimate > 0)) +
geom_col() +
coord_flip() +
labs(y = "Estimated effect on the rating of an episode") +
theme(legend.position = "none")
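mod$lambda.min gives the lowest cross-validated error, while mod$lambda.1se is the most regularized model within one standard error of that minimum; a quick way to compare how many features each choice retains:
# number of nonzero terms at each lambda choice
tidy(mod$glmnet.fit) %>%
  filter(lambda %in% c(mod$lambda.min, mod$lambda.1se),
         term != "(Intercept)") %>%
  count(lambda)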
library(tidyverse)
library(tidytuesdayR)
theme_set(theme_light())
# load and cleaning step
wine_ratings <- tt_load("2019-05-28")$`winemag-data-130k-v2` %>%
select(-X1) %>%
# extract a four-digit year starting with "20" into a new column called year
extract(title, "year", "(20\\d\\d)", convert = TRUE, remove = FALSE) %>%
mutate(year = ifelse(year < 1900, NA, year)) %>%
filter(!is.na(price))
##
## Downloading file 1 of 1: `winemag-data-130k-v2.csv`
wine_ratings %>%
count(country, sort = TRUE)
## # A tibble: 43 x 2
## country n
## <chr> <int>
## 1 US 54265
## 2 France 17776
## 3 Italy 16914
## 4 Spain 6573
## 5 Portugal 4875
## 6 Chile 4416
## 7 Argentina 3756
## 8 Austria 2799
## 9 Australia 2294
## 10 Germany 2120
## # … with 33 more rows
wine_ratings %>%
count(designation, sort = TRUE)
## # A tibble: 35,777 x 2
## designation n
## <chr> <int>
## 1 <NA> 34779
## 2 Reserve 1980
## 3 Estate 1318
## 4 Reserva 1219
## 5 Estate Grown 618
## 6 Riserva 607
## 7 Brut 472
## 8 Dry 406
## 9 Estate Bottled 342
## 10 Crianza 338
## # … with 35,767 more rows
wine_ratings %>%
count(country, region_1, sort = TRUE)
## # A tibble: 1,247 x 3
## country region_1 n
## <chr> <chr> <int>
## 1 Portugal <NA> 4875
## 2 US Napa Valley 4475
## 3 Chile <NA> 4416
## 4 US Columbia Valley (WA) 4109
## 5 US Russian River Valley 3090
## 6 Austria <NA> 2799
## 7 US California 2627
## 8 US Paso Robles 2327
## 9 US Willamette Valley 2296
## 10 Argentina Mendoza 2275
## # … with 1,237 more rows
wine_ratings %>%
count(taster_name, sort = TRUE)
## # A tibble: 20 x 2
## taster_name n
## <chr> <int>
## 1 <NA> 24496
## 2 Roger Voss 20172
## 3 Michael Schachner 14951
## 4 Kerin O’Keefe 9874
## 5 Virginie Boone 9507
## 6 Paul Gregutt 9498
## 7 Matt Kettmann 6237
## 8 Joe Czerwinski 5012
## 9 Sean P. Sullivan 4925
## 10 Anna Lee C. Iijima 4369
## 11 Jim Gordon 4171
## 12 Anne Krebiehl MW 3398
## 13 Lauren Buzzeo 1713
## 14 Susan Kostrzewa 1073
## 15 Mike DeSimone 504
## 16 Jeff Jenssen 491
## 17 Alexander Peartree 413
## 18 Carrie Dykes 138
## 19 Fiona Adams 27
## 20 Christina Pickard 6
wine_ratings %>%
filter(!is.na(designation)) %>%
count(variety, designation, sort = TRUE)
## # A tibble: 44,805 x 3
## variety designation n
## <chr> <chr> <int>
## 1 Cabernet Sauvignon Reserve 470
## 2 Chardonnay Reserve 324
## 3 Pinot Noir Reserve 279
## 4 Riesling Dry 231
## 5 Sangiovese Riserva 229
## 6 Sparkling Blend Brut 224
## 7 Pinot Noir Estate 208
## 8 Tempranillo Crianza 190
## 9 Cabernet Sauvignon Estate 187
## 10 Red Blend Riserva 182
## # … with 44,795 more rows
wine_ratings %>%
ggplot(aes(year)) +
geom_histogram()
wine_ratings %>%
ggplot(aes(points)) +
geom_histogram(binwidth = 1)
wine_ratings %>%
ggplot(aes(price)) +
geom_histogram() +
# use a log scale to check whether price is roughly log-normally distributed
scale_x_log10()
ggplot(wine_ratings, aes(price, points)) +
geom_point(alpha = .1) +
geom_smooth(method = "lm") +
scale_x_log10()
# predict points from the (log) price of a wine
summary(lm(points ~ log2(price), wine_ratings))
##
## Call:
## lm(formula = points ~ log2(price), data = wine_ratings)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0559 -1.5136 0.1294 1.7133 9.2408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.981419 0.035765 2208 <2e-16 ***
## log2(price) 1.974162 0.007338 269 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.408 on 120973 degrees of freedom
## Multiple R-squared: 0.3744, Adjusted R-squared: 0.3744
## F-statistic: 7.239e+04 on 1 and 120973 DF, p-value: < 2.2e-16
Every time the price doubles, the expected number of points goes up by about 2.
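As a worked example using the fitted coefficients (predict() would give the same numbers):
# predicted points at $20 vs $40, from intercept ~78.98 and slope ~1.97 per doubling
78.98 + 1.97 * log2(c(20, 40))
## ~87.5 and ~89.5: doubling the price adds about 2 points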
library(broom)
model <- wine_ratings %>%
replace_na(list(taster_name = "Missing", country = "Missing")) %>%
mutate(country = fct_relevel(fct_lump(country, 7), "US"),
taster_name = fct_relevel(fct_lump(taster_name, 6), "Missing")) %>%
lm(points ~ log2(price) + country + year + taster_name, data = .)
model %>%
tidy(conf.int = TRUE) %>%
filter(term != "(Intercept)") %>%
mutate(term = str_replace(term, "country", "Country: "),
term = str_replace(term, "taster_name", "Taster: "),
term = fct_reorder(term, estimate)) %>%
ggplot(aes(estimate, term)) +
geom_point() +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high))
# model %>%
# augment(data = wine_ratings) %>%
# ggplot(aes(.fitted, points)) +
# geom_point(alpha = .1)
# tidy(anova(model)) %>%
# mutate(sumsq / sum(sumsq))
library(tidytext)
wine_rating_words <- wine_ratings %>%
mutate(wine_id = row_number()) %>%
unnest_tokens(word, description) %>%
anti_join(stop_words, by = "word") %>%
filter(!word %in% c("wine", "drink"),
str_detect(word, "[a-z]"))
wine_rating_words %>%
count(word, sort = TRUE) %>%
head(20) %>%
mutate(word = fct_reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
coord_flip()
library(widyr)
wine_words_filtered <- wine_rating_words %>%
distinct(wine_id, word) %>%
add_count(word) %>%
filter(n >= 100)
wine_words_filtered %>%
pairwise_cor(word, wine_id, sort = TRUE)
## # A tibble: 5,501,370 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 verde vinho 0.985
## 2 vinho verde 0.985
## 3 verdot petit 0.969
## 4 petit verdot 0.969
## 5 nacional touriga 0.956
## 6 touriga nacional 0.956
## 7 sirah petite 0.956
## 8 petite sirah 0.956
## 9 smith granny 0.921
## 10 granny smith 0.921
## # … with 5,501,360 more rows
library(Matrix)
wine_word_matrix <- wine_words_filtered %>%
cast_sparse(wine_id, word)
wine_ids <- as.integer(rownames(wine_word_matrix))
scores <- wine_ratings$points[wine_ids] # align the response with the matrix rows
library(glmnet)
wine_word_matrix_extra <- cbind(wine_word_matrix, log_price = log2(wine_ratings$price[wine_ids]))
library(doMC) # parallel backend for foreach (Unix-like systems only)
registerDoMC(cores = 4)
cv_glmnet_model <- cv.glmnet(wine_word_matrix_extra, scores, parallel = TRUE)
plot(cv_glmnet_model)
lexicon <- cv_glmnet_model$glmnet.fit %>%
tidy() %>%
filter(lambda == cv_glmnet_model$lambda.1se,
term != "(Intercept)",
term != "log_price") %>%
select(word = term, coefficient = estimate)
lexicon %>%
arrange(coefficient) %>%
group_by(direction = ifelse(coefficient < 0, "Negative", "Positive")) %>%
top_n(16, abs(coefficient)) %>%
ungroup() %>%
mutate(word = fct_reorder(word, coefficient)) %>%
ggplot(aes(word, coefficient, fill = direction)) +
geom_col() +
coord_flip() +
labs(x = "",
y = "Estimated effect of the word on the score",
title = "What words are predictive of a wine's score?")
wine_ratings %>%
mutate(wine_id = row_number()) %>%
arrange(points) %>%
head(1) %>%
select(wine_id, description) %>%
pull(description)
## [1] "Aromas of pumpkin, squash and corn chips are stale and not inviting. There's an acceptable mouthfeel to this weird, unbalanced Chardonnay along with flavors of spiced squash, mealy apple and sautéed root vegetables."
wine_rating_words %>%
filter(wine_id %in% sample(unique(wine_id), 6)) %>%
distinct(word, title, points) %>%
mutate(wine = paste0(str_trunc(title, 40), " (", points, ")")) %>%
inner_join(lexicon, by = "word") %>%
mutate(word = fct_reorder(word, coefficient)) %>%
ggplot(aes(word, coefficient, fill = coefficient > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
facet_wrap(~ wine, scales = "free_y") +
labs(title = "How a lasso regression would predict each wine's score",
subtitle = "Using a lasso regression with an extra term for price",
x = "",
y = "Effect on score")
cv_glmnet_model$glmnet.fit %>%
tidy() %>%
filter(term %in% c("rich", "black", "simple", "complex", "vineyard", "concentrated")) %>%
ggplot(aes(lambda, estimate, color = term)) +
geom_line() +
scale_x_log10() +
geom_hline(lty = 2, yintercept = 0)
cv_glmnet_model$glmnet.fit %>%
tidy() %>%
count(lambda) %>%
ggplot(aes(lambda, n)) +
geom_line() +
scale_x_log10()
wine_ratings %>%
mutate(country = fct_relevel(fct_lump(country, 7), "US")) %>%
mutate(country = fct_reorder(country, points)) %>%
ggplot(aes(country, points)) +
geom_boxplot() +
coord_flip()
wine_ratings %>%
group_by(year) %>%
summarize(average_points = mean(points), n())
## # A tibble: 19 x 3
## year average_points `n()`
## <int> <dbl> <int>
## 1 2000 87.3 735
## 2 2001 87.4 668
## 3 2002 87.6 333
## 4 2003 88.4 499
## 5 2004 88.7 1604
## 6 2005 88.2 3293
## 7 2006 88.1 5170
## 8 2007 88.1 6498
## 9 2008 88.1 6725
## 10 2009 88.3 9056
## 11 2010 88.2 11105
## 12 2011 88.2 11437
## 13 2012 88.8 14742
## 14 2013 89.0 15196
## 15 2014 88.9 14892
## 16 2015 88.5 9642
## 17 2016 87.7 3549
## 18 2017 85.5 11
## 19 NA 87.6 5820
wine_ratings %>%
mutate(reviewer = fct_reorder(fct_lump(taster_name, 10), points)) %>%
ggplot(aes(reviewer, points)) +
geom_boxplot() +
coord_flip()