library(tidyverse)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
── Attaching packages ────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1     ✔ purrr   0.3.2
✔ tibble  2.1.1     ✔ dplyr   0.8.1
✔ tidyr   0.8.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ───────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(janitor)
Attaching package: ‘janitor’
The following objects are masked from ‘package:stats’:
chisq.test, fisher.test
# Load the TidyTuesday (2019-05-28) wine review data straight from GitHub.
wine_ratings <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-28/winemag-data-130k-v2.csv"
)
Missing column names filled in: 'X1' [1]
Parsed with column specification:
cols(
  X1 = col_double(),
  country = col_character(),
  description = col_character(),
  designation = col_character(),
  points = col_double(),
  price = col_double(),
  province = col_character(),
  region_1 = col_character(),
  region_2 = col_character(),
  taster_name = col_character(),
  taster_twitter_handle = col_character(),
  title = col_character(),
  variety = col_character(),
  winery = col_character()
)
# clean_names() returns a new data frame rather than modifying its input, so
# the result has to be assigned back for the cleaned names to take effect.
wine_ratings <- clean_names(wine_ratings)
# Print the cleaned data for inspection, as the bare call did before.
wine_ratings
NA
Look at the distribution of scores by evaluator. Remove a wine if its taster name, price, or points value is NA.
# Keep only reviews with a known taster, price and points score. Each step is
# stored under its own name so the intermediate subsets stay available.
known_taster <- wine_ratings %>% filter(!is.na(taster_name))
known_price_taster <- known_taster %>% filter(!is.na(price))
# Filter on points here: the price filter was already applied one line above.
known_points_price_taster <- known_price_taster %>% filter(!is.na(points))
Create the rating-to-price ratio (points divided by price).
# Points earned per unit of price for every remaining review.
rp_ratio_df <- known_points_price_taster %>%
  mutate(rp_ratio = points / price)

# Number of reviews per taster.
rp_ratio_df %>%
  group_by(taster_name) %>%
  tally()
Plot the rating/price ratio, and then the points, grouped by taster.
# Rating/price ratio per taster: jittered points with translucent boxplots on
# top, outline and fill colour both keyed to the taster.
ggplot(
  rp_ratio_df,
  aes(x = taster_name, y = rp_ratio, colour = taster_name, fill = taster_name)
) +
  geom_jitter(alpha = .5, height = 0, width = .25) +
  geom_boxplot(alpha = .25)
# Raw points per taster: jittered points plus boxplots, coloured by taster.
ggplot(rp_ratio_df, aes(x = taster_name, y = points, colour = taster_name)) +
  geom_jitter() +
  geom_boxplot()
Rating/price ratio by country.
# Rating/price ratio per country: jittered points with boxplots drawn on top.
ggplot(rp_ratio_df, aes(x = country, y = rp_ratio)) +
  geom_jitter() +
  geom_boxplot()
# Number of reviews per country; stored, then printed for inspection.
country_count <- rp_ratio_df %>%
  group_by(country) %>%
  tally()
country_count
NA
Merge the per-country counts into the data, then rename the count column.
# Attach each country's review count to every review row. merge() joins on
# the columns the two inputs have in common.
country_count <- merge(x = country_count, y = rp_ratio_df)
names(country_count)
[1] "country" "n"
[3] "X1" "description"
[5] "designation" "points"
[7] "price" "province"
[9] "region_1" "region_2"
[11] "taster_name" "taster_twitter_handle"
[13] "title" "variety"
[15] "winery" "rp_ratio"
Rename the `n` column. Syntax reminder from another project: `ufos <- ufos %>% rename(spotter.comments = comments)`
# Give the tally column a descriptive name.
country_count <- rename(country_count, total_count = n)
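Add a producer-level category based on the number of reviews per country: small (fewer than 100), medium (100–999), large (1,000–9,999), and massive (starting at 10,000).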
# Bucket each row by how many reviews its country has:
#   small    fewer than 100
#   medium   100 to 999
#   large    1,000 to 9,999
#   massive  10,000 or more
# cut() with right = FALSE makes every interval closed on the left. The
# open-ended top bucket means no count can fall through to an unlabelled
# "other" value, which is not among the factor levels used afterwards and
# would therefore silently become NA.
country_count <- mutate(
  country_count,
  producer_level = cut(
    total_count,
    breaks = c(0, 100, 1000, 10000, Inf),
    labels = c("small", "medium", "large", "massive"),
    right = FALSE,
    ordered_result = TRUE
  )
)
Fix the factor ordering. An earlier attempt, `country_count %>% as.factor(country_count$producer_level, levels = c("small", "medium", "large", "massive"))`, was replaced by the call below.
# Store producer_level as an ordered factor running from small to massive,
# then confirm the resulting class.
country_count[["producer_level"]] <- factor(
  country_count[["producer_level"]],
  levels = c("small", "medium", "large", "massive"),
  ordered = TRUE
)
class(country_count[["producer_level"]])
[1] "ordered" "factor"
# Number of review rows in each producer-level category.
country_count %>%
  group_by(producer_level) %>%
  tally()
Tally by producer level.
# Keep the per-category row counts for later use.
by_producer_level <- country_count %>%
  group_by(producer_level) %>%
  tally()

# Distribution of points within each producer-level category.
ggplot(country_count, aes(x = producer_level, y = points)) +
  geom_boxplot() +
  labs(
    title = "Meh: Surprisingly similar scores",
    subtitle = "Variability of Wine Score by Country Representation",
    x = "Countries Sorted by Review Representation Category",
    y = "Points"
  )
Small-producer countries: rating/price ratio by country.
# Restrict to countries in the "small" category and compare their
# rating/price ratios.
small_countries <- country_count %>%
  filter(producer_level == "small")

ggplot(small_countries, aes(x = country, y = rp_ratio)) +
  geom_boxplot()