knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning = FALSE, error = FALSE)
library(htmlwidgets)
library(tidyverse)
library(stringr)
library(skimr)
rainfall <- read_csv("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/rainfall.csv")
wine <- read_csv("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/winemag-data.csv")
library(htmlwidgets)
wine_red_example <- wine %>%
mutate(red_wine =
case_when(str_detect(description, "[Rr]ed") ~ "Red")) %>%
filter(!is.na(red_wine)) %>%
select(red_wine, points, price)
head(wine)
## # A tibble: 6 x 14
## X1 country description designation points price province region_1 region_2
## <int> <chr> <chr> <chr> <int> <dbl> <chr> <chr> <chr>
## 1 0 Italy Aromas inc… Vulkà Bian… 87 NA Sicily … Etna <NA>
## 2 1 Portug… This is ri… Avidagos 87 15 Douro <NA> <NA>
## 3 2 US Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 4 3 US Pineapple … Reserve La… 87 13 Michigan Lake Mi… <NA>
## 5 4 US Much like … Vintner's … 87 65 Oregon Willame… Willame…
## 6 5 Spain Blackberry… Ars In Vit… 87 15 Norther… Navarra <NA>
## # … with 5 more variables: taster_name <chr>, taster_twitter_handle <chr>,
## # title <chr>, variety <chr>, winery <chr>
wine %>%
mutate(wine_type =
case_when(str_detect(description, "[Rr]ed") ~ "Red",
str_detect(description, "[Ww]hite") ~ "White")) %>%
filter(!is.na(wine_type)) %>%
ggplot(aes(x = wine_type, y = points)) +
geom_boxplot() +
coord_flip()
#white_red_wine %>%
# filter(!is.na(white_wine) & !is.na(red_wine))
wine %>%
filter(country == "France") %>%
filter(price > 200) %>%
select(points, price, variety) %>%
arrange(desc(points), price, variety)
## # A tibble: 306 x 3
## points price variety
## <int> <dbl> <chr>
## 1 100 250 Champagne Blend
## 2 100 259 Champagne Blend
## 3 100 359 Bordeaux-style Red Blend
## 4 100 617 Chardonnay
## 5 100 848 Bordeaux-style White Blend
## 6 100 1500 Bordeaux-style Red Blend
## 7 100 1500 Bordeaux-style Red Blend
## 8 99 560 Chardonnay
## 9 99 800 Chardonnay
## 10 98 305 Champagne Blend
## # … with 296 more rows
glimpse(wine)
## Rows: 129,971
## Columns: 14
## $ X1 <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ country <chr> "Italy", "Portugal", "US", "US", "US", "Spain",…
## $ description <chr> "Aromas include tropical fruit, broom, brimston…
## $ designation <chr> "Vulkà Bianco", "Avidagos", NA, "Reserve Late H…
## $ points <int> 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87,…
## $ price <dbl> NA, 15, 14, 13, 65, 15, 16, 24, 12, 27, 19, 30,…
## $ province <chr> "Sicily & Sardinia", "Douro", "Oregon", "Michig…
## $ region_1 <chr> "Etna", NA, "Willamette Valley", "Lake Michigan…
## $ region_2 <chr> NA, NA, "Willamette Valley", NA, "Willamette Va…
## $ taster_name <chr> "Kerin O’Keefe", "Roger Voss", "Paul Gregutt", …
## $ taster_twitter_handle <chr> "@kerinokeefe", "@vossroger", "@paulgwine ", NA…
## $ title <chr> "Nicosia 2013 Vulkà Bianco (Etna)", "Quinta do…
## $ variety <chr> "White Blend", "Portuguese Red", "Pinot Gris", …
## $ winery <chr> "Nicosia", "Quinta dos Avidagos", "Rainstorm", …
skim(wine)
## Skim summary statistics
## n obs: 129971
## n variables: 14
## group variables:
##
## ── Variable type:character ─────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## country 63 129908 129971 2 22 0 43
## description 0 129971 129971 20 829 0 119955
## designation 37465 92506 129971 1 95 0 37979
## province 63 129908 129971 3 31 0 425
## region_1 21247 108724 129971 3 50 0 1229
## region_2 79460 50511 129971 4 17 0 17
## taster_name 26244 103727 129971 10 18 0 19
## taster_twitter_handle 31213 98758 129971 6 16 0 15
## title 0 129971 129971 12 136 0 118840
## variety 1 129970 129971 4 35 0 707
## winery 0 129971 129971 1 54 0 16757
##
## ── Variable type:integer ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75
## points 0 129971 129971 88.45 3.04 80 86 88 91
## X1 0 129971 129971 64985 37519.54 0 32492.5 64985 97477.5
## p100 hist
## 100 ▁▃▅▇▃▂▁▁
## 129970 ▇▇▇▇▇▇▇▇
##
## ── Variable type:numeric ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## price 8996 120975 129971 35.36 41.02 4 17 25 42 3300 ▇▁▁▁▁▁▁▁
Note: Using markdown is an easy way to marry analysis…
The Advantages are: 1. Simple 2. Reproducable 3. Easy for others to follow
Example analysis:
my_var <- exp(log(42))
Note2: it’s also cool bc look: the value of my_var is 42
wine %>%
glimpse()
## Rows: 129,971
## Columns: 14
## $ X1 <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ country <chr> "Italy", "Portugal", "US", "US", "US", "Spain",…
## $ description <chr> "Aromas include tropical fruit, broom, brimston…
## $ designation <chr> "Vulkà Bianco", "Avidagos", NA, "Reserve Late H…
## $ points <int> 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87,…
## $ price <dbl> NA, 15, 14, 13, 65, 15, 16, 24, 12, 27, 19, 30,…
## $ province <chr> "Sicily & Sardinia", "Douro", "Oregon", "Michig…
## $ region_1 <chr> "Etna", NA, "Willamette Valley", "Lake Michigan…
## $ region_2 <chr> NA, NA, "Willamette Valley", NA, "Willamette Va…
## $ taster_name <chr> "Kerin O’Keefe", "Roger Voss", "Paul Gregutt", …
## $ taster_twitter_handle <chr> "@kerinokeefe", "@vossroger", "@paulgwine ", NA…
## $ title <chr> "Nicosia 2013 Vulkà Bianco (Etna)", "Quinta do…
## $ variety <chr> "White Blend", "Portuguese Red", "Pinot Gris", …
## $ winery <chr> "Nicosia", "Quinta dos Avidagos", "Rainstorm", …
wine %>%
filter(variety == "Pinot Gris" & price < 10) %>%
select(points, price, variety) %>%
arrange(desc(points), price)
## # A tibble: 7 x 3
## points price variety
## <int> <dbl> <chr>
## 1 90 9 Pinot Gris
## 2 88 9 Pinot Gris
## 3 86 9 Pinot Gris
## 4 86 9 Pinot Gris
## 5 86 9 Pinot Gris
## 6 84 9 Pinot Gris
## 7 83 9 Pinot Gris
wine %>%
filter(variety == "Chardonnay" & province == "Oregon") %>%
select(points, price, variety) %>%
skim()
## Skim summary statistics
## n obs: 498
## n variables: 3
## group variables:
##
## ── Variable type:character ─────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## variety 0 498 498 10 10 0 1
##
## ── Variable type:integer ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## points 0 498 498 89.72 2.92 80 88 90 92 96 ▁▁▂▇▆▇▃▁
##
## ── Variable type:numeric ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## price 0 498 498 34.94 18.92 7 22 30 44.25 125 ▅▇▃▂▁▁▁▁
Long Excercise * Suggest some Oregon wines in the region of + Chehalem Mountains
wine %>%
filter(str_detect(region_1, "[Cc]hehalem Mountains"))
## # A tibble: 334 x 14
## X1 country description designation points price province region_1 region_2
## <int> <chr> <chr> <chr> <int> <dbl> <chr> <chr> <chr>
## 1 523 US There's a … Steve's Re… 91 65 Oregon Chehale… Willame…
## 2 609 US Grüner Vel… Estate 87 20 Oregon Chehale… Willame…
## 3 791 US Just one b… Chehalem M… 90 85 Oregon Chehale… Willame…
## 4 867 US Fresh frui… Estate 90 40 Oregon Chehale… Willame…
## 5 1092 US From vines… Old Vines 94 50 Oregon Chehale… Willame…
## 6 1514 US The myster… Inchinnan 92 54 Oregon Chehale… Willame…
## 7 2386 US This pleas… <NA> 87 25 Oregon Chehale… Willame…
## 8 2429 US The Dion i… Dion 85 50 Oregon Chehale… Willame…
## 9 2430 US It's diffi… Dion Viney… 85 14 Oregon Chehale… Willame…
## 10 3232 US A blend of… Nuages 88 35 Oregon Chehale… Willame…
## # … with 324 more rows, and 5 more variables: taster_name <chr>,
## # taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>
wine %>%
filter(str_detect(region_1, "[Cc]hehalem Mountains")) %>%
select(points, price, variety, winery) %>%
arrange(desc(points), desc(price))
## # A tibble: 334 x 4
## points price variety winery
## <int> <dbl> <chr> <chr>
## 1 96 85 Pinot Noir Alloro
## 2 95 90 Pinot Noir Adelsheim
## 3 95 65 Pinot Noir Bergström
## 4 95 45 Pinot Noir Alloro
## 5 94 90 Pinot Noir Adelsheim
## 6 94 85 Pinot Noir Alloro
## 7 94 80 Pinot Noir Le Cadeau
## 8 94 65 Chardonnay Lachini
## 9 94 65 Pinot Noir Bergström
## 10 94 65 Pinot Noir Arbor Brook
## # … with 324 more rows
Chehalem_mts <- wine %>%
filter(str_detect(region_1, "[Cc]hehalem Mountains")) %>%
select(points, price, variety, winery)
Chehalem_mts %>%
group_by(winery) %>%
summarize('avg_points' = mean(points)) %>%
arrange(desc(avg_points)) %>%
top_n(5, avg_points)
## # A tibble: 5 x 2
## winery avg_points
## <chr> <dbl>
## 1 Domaine Divio 94
## 2 Alloro 93.2
## 3 Aberrant Cellars 93
## 4 Timothy Malone 92.7
## 5 Patricia Green Cellars 92.5
Chehalem_mts %>%
group_by(winery) %>%
summarize('avg_points' = mean(points)) %>%
arrange(desc(avg_points)) %>%
top_n(5, avg_points)
## # A tibble: 5 x 2
## winery avg_points
## <chr> <dbl>
## 1 Domaine Divio 94
## 2 Alloro 93.2
## 3 Aberrant Cellars 93
## 4 Timothy Malone 92.7
## 5 Patricia Green Cellars 92.5
Chehalem_mts %>%
filter(winery == "Domaine Divio" | winery == "Alloro" | winery == "Aberrant Cellars") %>%
filter(price < 50) %>%
arrange(desc(points), desc(price), variety, winery)
## # A tibble: 11 x 4
## points price variety winery
## <int> <dbl> <chr> <chr>
## 1 95 45 Pinot Noir Alloro
## 2 94 45 Pinot Noir Alloro
## 3 94 45 Pinot Noir Alloro
## 4 93 35 Pinot Noir Alloro
## 5 93 34 Chardonnay Alloro
## 6 93 34 Chardonnay Alloro
## 7 93 28 Chardonnay Alloro
## 8 92 45 Pinot Noir Alloro
## 9 92 35 Pinot Noir Alloro
## 10 92 28 Chardonnay Alloro
## 11 91 35 Pinot Noir Alloro
Chehalem_mts %>%
filter(winery == "Domaine Divio" | winery == "Alloro" | winery == "Aberrant Cellars") %>%
filter(price < 50) %>%
count(str_detect(variety, "[Pp]inot Noir"))
## # A tibble: 2 x 2
## `str_detect(variety, "[Pp]inot Noir")` n
## * <lgl> <int>
## 1 FALSE 4
## 2 TRUE 7
Chehalem_mts %>%
filter(winery == "Timothy Malone" | winery == "Aberrant Cellars" | winery == "Patricia Green Cellars") %>%
arrange(desc(winery), desc(points), price, variety)
## # A tibble: 7 x 4
## points price variety winery
## <int> <dbl> <chr> <chr>
## 1 94 20 Riesling Timothy Malone
## 2 93 20 Riesling Timothy Malone
## 3 91 37 Pinot Noir Timothy Malone
## 4 93 48 Pinot Noir Patricia Green Cellars
## 5 92 48 Pinot Noir Patricia Green Cellars
## 6 93 50 Pinot Noir Aberrant Cellars
## 7 93 50 Pinot Noir Aberrant Cellars
Chehalem_mts %>%
filter(winery == "Timothy Malone" | winery == "Aberrant Cellars" | winery == "Patricia Green Cellars") %>%
filter(price <= 50 & points >= 90) %>%
ggplot(aes(x = winery, fill = variety)) + geom_bar()
ggplot(aes(character)) + geom_bar()
Chehalem_mts %>%
filter(winery == "Alloro" | winery == "Aberrant Cellars" | winery == "Patricia Green Cellars") %>%
filter(price <= 50 & points >= 90) %>%
ggplot(aes(x = price, y = points, color = variety)) + geom_point()
Chehalem_mts %>%
filter(variety == "Pinot Noir" | variety == "Chardonnay" | variety == "Riesling") %>%
filter(price <= 50, points >= 92 & points <= 95) %>%
ggplot(aes(x = price, y = points, color = variety)) + geom_point()
wine %>%
filter(country == "France" | country == "Italy") %>%
top_n(10, price) %>%
select(points, price, country, title) %>%
arrange(desc(points))
## # A tibble: 10 x 4
## points price country title
## <int> <dbl> <chr> <chr>
## 1 100 1500 France Château Lafite Rothschild 2010 Pauillac
## 2 100 1500 France Château Cheval Blanc 2010 Saint-Émilion
## 3 98 1900 France Château Margaux 2009 Margaux
## 4 97 2000 France Château Pétrus 2011 Pomerol
## 5 96 1200 France Château Haut-Brion 2009 Pessac-Léognan
## 6 96 1300 France Château Mouton Rothschild 2009 Pauillac
## 7 96 2500 France Château Pétrus 2014 Pomerol
## 8 96 2500 France Domaine du Comte Liger-Belair 2010 La Romanée
## 9 96 2000 France Domaine du Comte Liger-Belair 2005 La Romanée
## 10 88 3300 France Château les Ormes Sorbet 2013 Médoc
dont forget that | means OR!!!
arrange price ascending
wine %>%
filter(province == "Oregon" & variety != "Chardonnay") %>%
top_n(5, points) %>%
select(points, price, title) %>%
arrange(price)
## # A tibble: 7 x 3
## points price title
## <int> <dbl> <chr>
## 1 97 65 Ken Wright 2012 Abbott Claim Vineyard Pinot Noir
## 2 99 75 Cayuse 2009 En Chamberlin Vineyard Syrah (Walla Walla Valley (OR…
## 3 99 75 Cayuse 2011 En Chamberlin Vineyard Syrah (Walla Walla Valley (OR…
## 4 98 75 Cayuse 2011 En Cerise Vineyard Syrah (Walla Walla Valley (OR))
## 5 97 80 Cayuse 2009 The Widowmaker Cabernet Sauvignon (Walla Walla Valle…
## 6 97 85 Cayuse 2009 Armada Vineyard Syrah (Walla Walla Valley (OR))
## 7 97 90 Cayuse 2011 Widowmaker En Chamberlin Vineyard Cabernet Sauvignon…
wine %>%
filter(province == "Oregon") %>%
filter(variety != "Pinot Noir" & variety != "Chardonnay") %>%
top_frac(.05, points) %>%
select(points, price, title) %>%
arrange(desc(points), price)
## # A tibble: 131 x 3
## points price title
## <int> <dbl> <chr>
## 1 99 75 Cayuse 2009 En Chamberlin Vineyard Syrah (Walla Walla Valley (O…
## 2 99 75 Cayuse 2011 En Chamberlin Vineyard Syrah (Walla Walla Valley (O…
## 3 98 75 Cayuse 2011 En Cerise Vineyard Syrah (Walla Walla Valley (OR))
## 4 97 80 Cayuse 2009 The Widowmaker Cabernet Sauvignon (Walla Walla Vall…
## 5 97 85 Cayuse 2009 Armada Vineyard Syrah (Walla Walla Valley (OR))
## 6 97 90 Cayuse 2011 Widowmaker En Chamberlin Vineyard Cabernet Sauvigno…
## 7 96 32 Trisaetum 2016 Ribbon Ridge Estate Dry Riesling (Ribbon Ridge)
## 8 96 38 Trisaetum 2015 Estates Reserve Riesling (Willamette Valley)
## 9 96 70 Cayuse 2012 Cailloux Vineyard Viognier (Walla Walla Valley (OR))
## 10 96 75 Cayuse 2009 Camaspelo Cabernet Sauvignon-Merlot (Walla Walla Va…
## # … with 121 more rows
wine_country <- wine %>%
filter(variety == "Cabernet Sauvignon" | variety == "Chardonnay" | variety == "Pinot Gris" | variety == "Syrah") %>%
group_by(country, variety) %>%
summarize(points = mean(points)) %>%
arrange(country)
wine_country
## # A tibble: 90 x 3
## # Groups: country [32]
## country variety points
## <chr> <chr> <dbl>
## 1 Argentina Cabernet Sauvignon 86.0
## 2 Argentina Chardonnay 84.9
## 3 Argentina Pinot Gris 84.9
## 4 Argentina Syrah 85.8
## 5 Australia Cabernet Sauvignon 89.3
## 6 Australia Chardonnay 87.3
## 7 Australia Pinot Gris 87.4
## 8 Australia Syrah 91.6
## 9 Austria Cabernet Sauvignon 87.4
## 10 Austria Chardonnay 90.3
## # … with 80 more rows
wide_wine <- wine_country %>%
pivot_wider(variety, points)
wide_wine
## # A tibble: 4 x 1
## variety
## <chr>
## 1 Cabernet Sauvignon
## 2 Chardonnay
## 3 Pinot Gris
## 4 Syrah
wine_long <- wide_wine %>%
pivot_longer("variety", "points", 2:5)
wine_long
## # A tibble: 4 x 2
## points value
## <chr> <chr>
## 1 variety Cabernet Sauvignon
## 2 variety Chardonnay
## 3 variety Pinot Gris
## 4 variety Syrah
wine %>%
filter(country == "US" | country == "France") %>%
filter(variety == "Chardonnay") %>%
count(country)
## # A tibble: 2 x 2
## country n
## * <chr> <int>
## 1 France 2808
## 2 US 6801
wine %>%
filter(country == "US" | country == "France") %>%
filter(variety == "Chardonnay") %>%
ggplot(aes(x = country)) + geom_bar()
wine %>%
filter(province == "Washington" | province == "Oregon" | province == "California") %>%
filter(variety == "Cabernet Sauvignon" | variety == "Syrah" | variety == "Pinot Noir") %>%
ggplot(aes(x = province, fill = variety)) + geom_bar()
wine %>%
filter(province == "Washington" | province == "Oregon" | province == "California") %>%
filter(points >= 97) %>%
ggplot(aes(x = province, fill = variety)) + geom_bar() + scale_fill_manual(values=c("#CC0000", "#006600", "#669999", "#00CCCC",
"#660099", "#CC0066", "#FF9999", "#FF9900",
"blue", "yellow", "blue2", "grey", "grey2"))
wine %>%
summarize(avg_points = mean(points))
## # A tibble: 1 x 1
## avg_points
## <dbl>
## 1 88.4
wine %>%
ggplot(aes(x = points)) + geom_histogram()
wine %>%
ggplot(aes(x = points, y = log(price))) +geom_point()
wine %>%
filter(country == "US") %>%
filter(!is.na(price)) %>%
group_by(province) %>%
summarize(
count = n(),
average_points = mean(points),
average_price = mean(price)) %>%
filter(count > 100) %>%
arrange(desc(average_points))
## # A tibble: 7 x 4
## province count average_points average_price
## <chr> <int> <dbl> <dbl>
## 1 Oregon 5359 89.1 36.5
## 2 Washington 8583 89.0 32.4
## 3 California 36104 88.6 39.0
## 4 New York 2676 87.2 22.8
## 5 Idaho 190 86.6 20.8
## 6 Michigan 111 86.2 32.4
## 7 Virginia 770 85.6 27.0
wine %>%
filter(country == "US") %>%
filter(!is.na(price)) %>%
group_by(province, variety) %>%
summarize(count = n(),
max_price = max(price)) %>%
filter(count>100) %>%
arrange(desc(count))
## # A tibble: 64 x 4
## # Groups: province [5]
## province variety count max_price
## <chr> <chr> <int> <dbl>
## 1 California Pinot Noir 6875 155
## 2 California Cabernet Sauvignon 5668 625
## 3 California Chardonnay 5157 2013
## 4 Oregon Pinot Noir 2779 275
## 5 California Zinfandel 2633 100
## 6 California Syrah 1862 750
## 7 California Sauvignon Blanc 1801 75
## 8 California Red Blend 1791 290
## 9 California Merlot 1390 200
## 10 Washington Cabernet Sauvignon 1356 160
## # … with 54 more rows
wine %>%
filter(province == "Washington" | province == "Oregon" | province == "California") %>%
ggplot(aes(x=log(price), fill = province)) + geom_density(alpha = 0.4)
wine %>%
filter(province == "Washington" | province == "Oregon" | province == "California") %>%
ggplot(aes(x=log(price), fill = province)) + geom_density(alpha = 0.1)
Grouping by province shows the same data as…
wine %>%
group_by(province)
## # A tibble: 129,971 x 14
## # Groups: province [426]
## X1 country description designation points price province region_1 region_2
## <int> <chr> <chr> <chr> <int> <dbl> <chr> <chr> <chr>
## 1 0 Italy Aromas inc… Vulkà Bian… 87 NA Sicily … Etna <NA>
## 2 1 Portug… This is ri… Avidagos 87 15 Douro <NA> <NA>
## 3 2 US Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 4 3 US Pineapple … Reserve La… 87 13 Michigan Lake Mi… <NA>
## 5 4 US Much like … Vintner's … 87 65 Oregon Willame… Willame…
## 6 5 Spain Blackberry… Ars In Vit… 87 15 Norther… Navarra <NA>
## 7 6 Italy Here's a b… Belsito 87 16 Sicily … Vittoria <NA>
## 8 7 France This dry a… <NA> 87 24 Alsace Alsace <NA>
## 9 8 Germany Savory dri… Shine 87 12 Rheinhe… <NA> <NA>
## 10 9 France This has g… Les Natures 87 27 Alsace Alsace <NA>
## # … with 129,961 more rows, and 5 more variables: taster_name <chr>,
## # taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>
wine %>%
group_by(province, variety)
## # A tibble: 129,971 x 14
## # Groups: province, variety [4,151]
## X1 country description designation points price province region_1 region_2
## <int> <chr> <chr> <chr> <int> <dbl> <chr> <chr> <chr>
## 1 0 Italy Aromas inc… Vulkà Bian… 87 NA Sicily … Etna <NA>
## 2 1 Portug… This is ri… Avidagos 87 15 Douro <NA> <NA>
## 3 2 US Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 4 3 US Pineapple … Reserve La… 87 13 Michigan Lake Mi… <NA>
## 5 4 US Much like … Vintner's … 87 65 Oregon Willame… Willame…
## 6 5 Spain Blackberry… Ars In Vit… 87 15 Norther… Navarra <NA>
## 7 6 Italy Here's a b… Belsito 87 16 Sicily … Vittoria <NA>
## 8 7 France This dry a… <NA> 87 24 Alsace Alsace <NA>
## 9 8 Germany Savory dri… Shine 87 12 Rheinhe… <NA> <NA>
## 10 9 France This has g… Les Natures 87 27 Alsace Alsace <NA>
## # … with 129,961 more rows, and 5 more variables: taster_name <chr>,
## # taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>
wine %>%
group_by(country) %>%
summarize('avg. price'=mean(price)) %>%
head(5)
## # A tibble: 5 x 2
## country `avg. price`
## <chr> <dbl>
## 1 Argentina NA
## 2 Armenia 14.5
## 3 Australia NA
## 4 Austria NA
## 5 Bosnia and Herzegovina 12.5
wine %>%
filter(country=="Argentina") %>%
group_by(variety) %>%
summarize(
'avg_points' = mean(points)) %>%
top_n(5,avg_points) %>%
arrange(desc(avg_points))
## # A tibble: 5 x 2
## variety avg_points
## <chr> <dbl>
## 1 Malbec-Cabernet 91.7
## 2 Cabernet Franc-Malbec 91
## 3 Malbec-Petit Verdot 91
## 4 Tempranillo-Malbec 91
## 5 Syrah-Viognier 90.7
Note that by running summarize twice, I am back to the results from the previous slide.
wine %>%
filter(country=="Argentina") %>%
group_by(variety, winery) %>%
summarize(points = mean(points)) %>%
summarize('avg_points' = mean(points)) %>%
top_n(5,avg_points) %>%
arrange(desc(avg_points))
## # A tibble: 5 x 2
## variety avg_points
## <chr> <dbl>
## 1 Malbec-Cabernet 91.7
## 2 Cabernet Franc-Malbec 91
## 3 Malbec-Petit Verdot 91
## 4 Tempranillo-Malbec 91
## 5 Syrah-Viognier 90.7
wine %>%
filter(country=="Argentina") %>%
summarize(winery_count=n_distinct(winery))
## # A tibble: 1 x 1
## winery_count
## <int>
## 1 531
wine %>%
filter(country=="Argentina") %>%
count(winery)
## # A tibble: 531 x 2
## winery n
## * <chr> <int>
## 1 2 Copas 1
## 2 25 Lagunas 1
## 3 Achaval-Ferrer 18
## 4 Aconcagua 3
## 5 Aconga 5
## 6 Acordeón 6
## 7 Adoquin 2
## 8 Aguijón De Abeja 4
## 9 Aitor Ider Balbo 1
## 10 Alamos 24
## # … with 521 more rows
pop <- read_csv("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/population.csv") %>%
pivot_longer(., names_to ="year", values_to = "population", 3:61) %>%
rename("country"="Country Name")
pop2017 <- pop %>%
filter(year==2017) %>%
select(country, population)
pop2017
## # A tibble: 263 x 2
## country population
## <chr> <dbl>
## 1 Aruba 105366
## 2 Afghanistan 36296400
## 3 Angola 29816748
## 4 Albania 2873457
## 5 Andorra 77001
## 6 Arab World 411898965
## 7 United Arab Emirates 9487203
## 8 Argentina 44044811
## 9 Armenia 2944809
## 10 American Samoa 55620
## # … with 253 more rows
wine_pop <- wine %>%
inner_join(pop2017) %>%
select(country, population, title)
wine_pop
## # A tibble: 75,316 x 3
## country population title
## <chr> <dbl> <chr>
## 1 Italy 60536709 Nicosia 2013 Vulkà Bianco (Etna)
## 2 Portugal 10300300 Quinta dos Avidagos 2011 Avidagos Red (Douro)
## 3 Spain 46593236 Tandem 2011 Ars In Vitro Tempranillo-Merlot (Navarra)
## 4 Italy 60536709 Terre di Giurfo 2013 Belsito Frappato (Vittoria)
## 5 France 66865144 Trimbach 2012 Gewurztraminer (Alsace)
## 6 Germany 82657002 Heinz Eifel 2013 Shine Gewürztraminer (Rheinhessen)
## 7 France 66865144 Jean-Baptiste Adam 2012 Les Natures Pinot Gris (Alsace)
## 8 France 66865144 Leon Beyer 2012 Gewurztraminer (Alsace)
## 9 Italy 60536709 Masseria Setteporte 2012 Rosso (Etna)
## 10 Germany 82657002 Richard Böcking 2013 Devon Riesling (Mosel)
## # … with 75,306 more rows
wine_pop <- wine %>%
left_join(pop2017) %>%
select(country, population, title)
wine_pop
## # A tibble: 129,971 x 3
## country population title
## <chr> <dbl> <chr>
## 1 Italy 60536709 Nicosia 2013 Vulkà Bianco (Etna)
## 2 Portugal 10300300 Quinta dos Avidagos 2011 Avidagos Red (Douro)
## 3 US NA Rainstorm 2013 Pinot Gris (Willamette Valley)
## 4 US NA St. Julian 2013 Reserve Late Harvest Riesling (Lake Mich…
## 5 US NA Sweet Cheeks 2012 Vintner's Reserve Wild Child Block Pin…
## 6 Spain 46593236 Tandem 2011 Ars In Vitro Tempranillo-Merlot (Navarra)
## 7 Italy 60536709 Terre di Giurfo 2013 Belsito Frappato (Vittoria)
## 8 France 66865144 Trimbach 2012 Gewurztraminer (Alsace)
## 9 Germany 82657002 Heinz Eifel 2013 Shine Gewürztraminer (Rheinhessen)
## 10 France 66865144 Jean-Baptiste Adam 2012 Les Natures Pinot Gris (Alsace)
## # … with 129,961 more rows
wine %>%
anti_join(pop2017)
## # A tibble: 54,655 x 14
## X1 country description designation points price province region_1 region_2
## <int> <chr> <chr> <chr> <int> <dbl> <chr> <chr> <chr>
## 1 2 US Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 2 3 US Pineapple … Reserve La… 87 13 Michigan Lake Mi… <NA>
## 3 4 US Much like … Vintner's … 87 65 Oregon Willame… Willame…
## 4 10 US Soft, supp… Mountain C… 87 19 Califor… Napa Va… Napa
## 5 12 US Slightly r… <NA> 87 34 Califor… Alexand… Sonoma
## 6 14 US Building o… <NA> 87 12 Califor… Central… Central…
## 7 19 US Red fruit … <NA> 87 32 Virginia Virginia <NA>
## 8 20 US Ripe aroma… Vin de Mai… 87 23 Virginia Virginia <NA>
## 9 21 US A sleek mi… <NA> 87 20 Oregon Oregon Oregon …
## 10 23 US This wine … Signature … 87 22 Califor… Paso Ro… Central…
## # … with 54,645 more rows, and 5 more variables: taster_name <chr>,
## # taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>
The function mutate() from dplyr allows you to create new variables from existing ones.
wine %>%
mutate(lprice=log(price)) %>%
select(price, lprice)
## # A tibble: 129,971 x 2
## price lprice
## <dbl> <dbl>
## 1 NA NA
## 2 15 2.71
## 3 14 2.64
## 4 13 2.56
## 5 65 4.17
## 6 15 2.71
## 7 16 2.77
## 8 24 3.18
## 9 12 2.48
## 10 27 3.30
## # … with 129,961 more rows
You can also mutate by group. For instance, let’s say you wanted to standardize prices by variety
wine %>%
group_by(variety) %>%
mutate(npoints = (points-mean(points)) / sd(points)) %>%
filter(variety == "Riesling" | variety == "Cabernet Sauvignon") %>%
ggplot(aes(npoints, fill=variety))+
geom_density(alpha=0.3)
Let’s change that pesky ‘US’ identifier to ‘United States’
wine <- wine %>%
mutate(country=ifelse(country=="US","United States",country))
wine
## # A tibble: 129,971 x 14
## X1 country description designation points price province region_1 region_2
## <int> <chr> <chr> <chr> <int> <dbl> <chr> <chr> <chr>
## 1 0 Italy Aromas inc… Vulkà Bian… 87 NA Sicily … Etna <NA>
## 2 1 Portug… This is ri… Avidagos 87 15 Douro <NA> <NA>
## 3 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 4 3 United… Pineapple … Reserve La… 87 13 Michigan Lake Mi… <NA>
## 5 4 United… Much like … Vintner's … 87 65 Oregon Willame… Willame…
## 6 5 Spain Blackberry… Ars In Vit… 87 15 Norther… Navarra <NA>
## 7 6 Italy Here's a b… Belsito 87 16 Sicily … Vittoria <NA>
## 8 7 France This dry a… <NA> 87 24 Alsace Alsace <NA>
## 9 8 Germany Savory dri… Shine 87 12 Rheinhe… <NA> <NA>
## 10 9 France This has g… Les Natures 87 27 Alsace Alsace <NA>
## # … with 129,961 more rows, and 5 more variables: taster_name <chr>,
## # taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>
library(stringr)
name <- "Jameson Watts"
name
## [1] "Jameson Watts"
quote <- '"As soon as you stop wanting something, you get it." - Andy Warhol'
quote
## [1] "\"As soon as you stop wanting something, you get it.\" - Andy Warhol"
writeLines(quote)
## "As soon as you stop wanting something, you get it." - Andy Warhol
str_length(quote)
## [1] 66
name_quote <- str_c("Name: ", name, "\nQuote: ", quote)
writeLines(name_quote)
## Name: Jameson Watts
## Quote: "As soon as you stop wanting something, you get it." - Andy Warhol
str_sub(name_quote,10,30)
## [1] "eson Watts\nQuote: \"As"
str_to_lower(name_quote)
## [1] "name: jameson watts\nquote: \"as soon as you stop wanting something, you get it.\" - andy warhol"
str_locate(name_quote,"Jameson")
## start end
## [1,] 7 13
library(htmlwidgets)
str_detect(name_quote, "on Wa")
## [1] TRUE
str_detect(name_quote, "on wa")
## [1] FALSE
str_view_all(name_quote, "ou")
Placeholders and Repetition
Anchors
Character classes
str_view_all(name_quote, "a.+s")
wine %>%
filter(str_detect(variety,"[Cc]abernet")) %>%
select(points, price, variety) %>%
arrange(desc(price))
## # A tibble: 11,746 x 3
## points price variety
## <int> <dbl> <chr>
## 1 94 625 Cabernet Sauvignon
## 2 98 625 Cabernet Sauvignon
## 3 97 625 Cabernet Sauvignon
## 4 92 600 Tempranillo-Cabernet Sauvignon
## 5 93 500 Cabernet Sauvignon
## 6 89 500 Tempranillo-Cabernet Sauvignon
## 7 95 500 Cabernet Sauvignon
## 8 89 500 Tempranillo-Cabernet Sauvignon
## 9 92 400 Cabernet Sauvignon
## 10 96 400 Cabernet Sauvignon
## # … with 11,736 more rows
Note: these sorts of queries can tax your computer
wine %>%
filter(province=="Oregon") %>%
filter(str_detect(description,"[Ee]spresso") & str_detect(description,"[Bb]lack currant")) %>%
select(variety, price, points, winery) %>%
arrange(desc(points), price)
## # A tibble: 1 x 4
## variety price points winery
## <chr> <dbl> <int> <chr>
## 1 Pinot Noir 66 94 Shea
wine %>%
filter(province=="Oregon") %>%
mutate(fruity = str_detect(description,"[Ff]ruity")) %>%
ggplot(aes(points, fill=fruity))+
geom_density(alpha=.5)
wine <- wine %>%
mutate(year = str_extract(title,"(\\d{4})"))
wine %>%
select(title, year)
## # A tibble: 129,971 x 2
## title year
## <chr> <chr>
## 1 Nicosia 2013 Vulkà Bianco (Etna) 2013
## 2 Quinta dos Avidagos 2011 Avidagos Red (Douro) 2011
## 3 Rainstorm 2013 Pinot Gris (Willamette Valley) 2013
## 4 St. Julian 2013 Reserve Late Harvest Riesling (Lake Michigan Shore) 2013
## 5 Sweet Cheeks 2012 Vintner's Reserve Wild Child Block Pinot Noir (Willa… 2012
## 6 Tandem 2011 Ars In Vitro Tempranillo-Merlot (Navarra) 2011
## 7 Terre di Giurfo 2013 Belsito Frappato (Vittoria) 2013
## 8 Trimbach 2012 Gewurztraminer (Alsace) 2012
## 9 Heinz Eifel 2013 Shine Gewürztraminer (Rheinhessen) 2013
## 10 Jean-Baptiste Adam 2012 Les Natures Pinot Gris (Alsace) 2012
## # … with 129,961 more rows
wine %>%
filter(province=="Oregon") %>%
filter(variety=="Pinot Noir" | variety=="Chardonnay" | variety=="Pinot Gris") %>%
filter(year >= 1995) %>%
group_by(year, variety) %>%
summarize(avg_points = mean(points)) %>%
ggplot(aes(x=as.integer(year), y=avg_points, color=variety)) +
geom_line()
Now that we’ve extracted year from the title, we can do a join on both country and year
wine %>%
left_join(pop) %>%
filter(!is.na(population) & !is.na(year)) %>%
filter(population < 1000000000) %>%
group_by(country,year) %>%
summarize(population=mean(population),price = mean(price)) %>%
ggplot(aes(x=log(population), y=log(price))) +
geom_jitter(alpha=.1)+
geom_smooth(method = lm)
top_wineries <- wine %>%
group_by(winery) %>%
summarize(
avg_points=mean(points),
count=n()) %>%
filter(count > 10) %>%
top_frac(.05,avg_points) %>%
left_join(wine)
top_wineries
## # A tibble: 4,374 x 17
## winery avg_points count X1 country description designation points price
## <chr> <dbl> <int> <int> <chr> <chr> <chr> <int> <dbl>
## 1 Abeja 92.4 28 6061 United… All variet… Heather Hi… 89 55
## 2 Abeja 92.4 28 6737 United… Fresh and … <NA> 94 36
## 3 Abeja 92.4 28 28201 United… The wine i… <NA> 91 36
## 4 Abeja 92.4 28 33841 United… Sourced fr… Reserve 97 85
## 5 Abeja 92.4 28 33847 United… This 100% … <NA> 95 38
## 6 Abeja 92.4 28 40428 United… Made from … <NA> 92 40
## 7 Abeja 92.4 28 48216 United… This new v… <NA> 93 28
## 8 Abeja 92.4 28 54754 United… Abeja wine… <NA> 94 38
## 9 Abeja 92.4 28 54763 United… Abeja's Ch… <NA> 94 36
## 10 Abeja 92.4 28 54891 United… Walla Wall… Estate Gro… 90 30
## # … with 4,364 more rows, and 8 more variables: province <chr>, region_1 <chr>,
## # region_2 <chr>, taster_name <chr>, taster_twitter_handle <chr>,
## # title <chr>, variety <chr>, year <chr>
rainfall
## # A tibble: 48 x 13
## Year Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1970 13.5 4.46 1.92 2.63 1.36 0.85 0.01 NA 1.81 3.25 7.18 9.74
## 2 1971 6.49 4.34 6.93 4.05 1.89 2.47 0.01 1.49 3.98 3.09 6.27 8.18
## 3 1972 7.98 4.68 4.96 3.79 2.4 0.69 0.12 0.14 2.07 0.7 3.77 8.7
## 4 1973 5.64 1.62 3.5 1.69 1.11 1.48 NA 0.8 2.8 2.79 15.2 11.1
## 5 1974 10.9 5.56 7.95 1.48 0.9 0.41 1.8 0.11 0.28 2.15 7.42 6.94
## 6 1975 4.96 4.68 4.22 2.2 1.66 0.81 0.51 1.96 0 5.51 6.06 6.07
## 7 1976 5.47 6.92 3.66 2 1.33 1.04 0.67 1.89 1.13 1.51 1.13 1.26
## 8 1977 0.88 2.83 3.33 0.62 3.76 0.73 0.26 1.7 2.36 2.37 6.19 8.73
## 9 1978 5.67 3.54 1.23 3.5 2.97 0.48 1.07 2.56 2.64 0.37 4.5 2.64
## 10 1979 2.84 7.19 2.17 2.82 2.2 0.65 0.3 0.7 2.19 6.06 3.83 6.95
## # … with 38 more rows
rainfall %>%
rename("year" = "Year") %>%
pivot_longer(-year, names_to = "month", values_to = "rainfall")
## # A tibble: 576 x 3
## year month rainfall
## <int> <chr> <dbl>
## 1 1970 Jan 13.5
## 2 1970 Feb 4.46
## 3 1970 Mar 1.92
## 4 1970 Apr 2.63
## 5 1970 May 1.36
## 6 1970 Jun 0.85
## 7 1970 Jul 0.01
## 8 1970 Aug NA
## 9 1970 Sep 1.81
## 10 1970 Oct 3.25
## # … with 566 more rows
new_wine <-wine %>%
filter(!is.na(price)) %>%
filter(province == "Oregon") %>%
mutate(year = as.numeric(str_extract(title,"(\\d{4})"))) %>%
left_join(rainfall, by = c("year" = "Year")) %>%
pivot_longer(16:26, names_to = "month", values_to = "rainfall")
new_wine %>%
select(year, month, rainfall, title)
## # A tibble: 58,949 x 4
## year month rainfall title
## <dbl> <chr> <dbl> <chr>
## 1 2013 Jan 1.63 Rainstorm 2013 Pinot Gris (Willamette Valley)
## 2 2013 Feb 1.42 Rainstorm 2013 Pinot Gris (Willamette Valley)
## 3 2013 Mar 2.21 Rainstorm 2013 Pinot Gris (Willamette Valley)
## 4 2013 Apr 2.39 Rainstorm 2013 Pinot Gris (Willamette Valley)
## 5 2013 May 2.94 Rainstorm 2013 Pinot Gris (Willamette Valley)
## 6 2013 Jun 1.02 Rainstorm 2013 Pinot Gris (Willamette Valley)
## 7 2013 Jul 0 Rainstorm 2013 Pinot Gris (Willamette Valley)
## 8 2013 Aug 0.35 Rainstorm 2013 Pinot Gris (Willamette Valley)
## 9 2013 Sep 7.05 Rainstorm 2013 Pinot Gris (Willamette Valley)
## 10 2013 Oct 0.63 Rainstorm 2013 Pinot Gris (Willamette Valley)
## # … with 58,939 more rows
new_wine %>%
mutate(month_number =
case_when(month == "Jan" ~ 1,
month == "Feb" ~ 2,
month == "Mar" ~ 3,
month == "Apr" ~ 4,
month == "May" ~ 5,
month == "Jun" ~ 6,
month == "Jul" ~ 7,
month == "Aug" ~ 8,
month == "Sep" ~ 9,
month == "Oct" ~ 10,
month == "Nov" ~ 11,
month == "Dec" ~ 12,
))
## # A tibble: 58,949 x 19
## X1 country description designation points price province region_1 region_2
## <int> <chr> <chr> <chr> <int> <dbl> <chr> <chr> <chr>
## 1 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 2 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 3 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 4 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 5 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 6 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 7 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 8 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 9 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 10 2 United… Tart and s… <NA> 87 14 Oregon Willame… Willame…
## # … with 58,939 more rows, and 10 more variables: taster_name <chr>,
## # taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>,
## # year <dbl>, Dec <dbl>, month <chr>, rainfall <dbl>, month_number <dbl>
create “character” w/ values tart/spicy oregon wines w/ those terms in description density plot
new_wine <- new_wine %>%
pivot_wider(names_from = month, values_from = rainfall) %>%
mutate(character = case_when(
str_detect(description, "[Tt]art") ~ 'tart',
str_detect(description, "[Ss]icy") ~ 'spicy'))
new_wine %>%
filter(!is.na(character)) %>%
ggplot(aes(log(price), fill=character)) + geom_density(alpha =.7)
new_wine %>%
filter(year > 1995) %>%
filter(!is.na(character)) %>%
ggplot(aes(x=year, y = points, color = character)) + geom_jitter()
## Scatter plot example
new_wine %>%
filter(year>1995) %>%
filter(!is.na(character)) %>%
ggplot(aes(x=year, y=points, color=character)) +
geom_jitter()
new_wine %>%
filter(!is.na(character)) %>%
ggplot(aes(character)) +
geom_bar()
Plot the counts of each character of wine between 1995 and 2015
new_wine %>%
filter(year>1995 & year <= 2015) %>%
filter(!is.na(character)) %>%
group_by(year,character) %>%
summarise(char_cnt=n()) %>%
ggplot(aes(year,char_cnt, color=character))+
geom_line()
new_wine %>%
ggplot(aes(price))+
geom_histogram()
new_wine %>%
filter(!is.na(character)) %>%
ggplot(aes(character,log(price), color=character))+
geom_violin()
Use any of the techniques that you’ve learned thus far to answer the following:
Is there a relationship between rainfall and wine quality in Oregon?
rains <- rainfall %>%
rename("year"="Year") %>%
pivot_longer(-year,names_to = 'month',values_to = 'rainfall') %>%
mutate(rainfall=ifelse(is.na(rainfall),0,rainfall)) %>%
filter(month %in% c('May','Jun','Jul','Aug','Sep')) %>% #note the %in% operator
group_by(year) %>%
summarise(summer_rain=sum(rainfall))
new_wine <- new_wine %>%
filter(points > 88) %>%
group_by(year) %>%
summarize(avg_price=mean(price), avg_points=mean(points)) %>%
left_join(rains)
new_wine %>%
ggplot(aes(log(summer_rain),avg_points))+
geom_point()+
geom_smooth(method = lm)