Setup

knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning = FALSE, error = FALSE)
library(htmlwidgets)
library(tidyverse)
library(stringr)
library(skimr)

rainfall <- read_csv("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/rainfall.csv")

wine <- read_csv("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/winemag-data.csv")
library(htmlwidgets)

What is the mean/median rating and cost of a bottle of red wine?

wine_red_example <- wine %>%
    mutate(red_wine = 
        case_when(str_detect(description, "[Rr]ed") ~ "Red")) %>%
    filter(!is.na(red_wine)) %>%
    select(red_wine, points, price)
head(wine)
## # A tibble: 6 x 14
##      X1 country description designation points price province region_1 region_2
##   <int> <chr>   <chr>       <chr>        <int> <dbl> <chr>    <chr>    <chr>   
## 1     0 Italy   Aromas inc… Vulkà Bian…     87    NA Sicily … Etna     <NA>    
## 2     1 Portug… This is ri… Avidagos        87    15 Douro    <NA>     <NA>    
## 3     2 US      Tart and s… <NA>            87    14 Oregon   Willame… Willame…
## 4     3 US      Pineapple … Reserve La…     87    13 Michigan Lake Mi… <NA>    
## 5     4 US      Much like … Vintner's …     87    65 Oregon   Willame… Willame…
## 6     5 Spain   Blackberry… Ars In Vit…     87    15 Norther… Navarra  <NA>    
## # … with 5 more variables: taster_name <chr>, taster_twitter_handle <chr>,
## #   title <chr>, variety <chr>, winery <chr>
wine %>%
    mutate(wine_type = 
        case_when(str_detect(description, "[Rr]ed") ~ "Red",
                  str_detect(description, "[Ww]hite") ~ "White")) %>%
    filter(!is.na(wine_type)) %>%
    ggplot(aes(x = wine_type, y = points)) +
  geom_boxplot() +
  coord_flip()

Is wine from the willamette valley more/less expensive then anywhere else?

#white_red_wine %>%
 #   filter(!is.na(white_wine) & !is.na(red_wine))
wine %>%
  filter(country == "France") %>%
  filter(price > 200) %>%
  select(points, price, variety) %>%
  arrange(desc(points), price, variety)
## # A tibble: 306 x 3
##    points price variety                   
##     <int> <dbl> <chr>                     
##  1    100   250 Champagne Blend           
##  2    100   259 Champagne Blend           
##  3    100   359 Bordeaux-style Red Blend  
##  4    100   617 Chardonnay                
##  5    100   848 Bordeaux-style White Blend
##  6    100  1500 Bordeaux-style Red Blend  
##  7    100  1500 Bordeaux-style Red Blend  
##  8     99   560 Chardonnay                
##  9     99   800 Chardonnay                
## 10     98   305 Champagne Blend           
## # … with 296 more rows
glimpse(wine)
## Rows: 129,971
## Columns: 14
## $ X1                    <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ country               <chr> "Italy", "Portugal", "US", "US", "US", "Spain",…
## $ description           <chr> "Aromas include tropical fruit, broom, brimston…
## $ designation           <chr> "Vulkà Bianco", "Avidagos", NA, "Reserve Late H…
## $ points                <int> 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87,…
## $ price                 <dbl> NA, 15, 14, 13, 65, 15, 16, 24, 12, 27, 19, 30,…
## $ province              <chr> "Sicily & Sardinia", "Douro", "Oregon", "Michig…
## $ region_1              <chr> "Etna", NA, "Willamette Valley", "Lake Michigan…
## $ region_2              <chr> NA, NA, "Willamette Valley", NA, "Willamette Va…
## $ taster_name           <chr> "Kerin O’Keefe", "Roger Voss", "Paul Gregutt", …
## $ taster_twitter_handle <chr> "@kerinokeefe", "@vossroger", "@paulgwine ", NA…
## $ title                 <chr> "Nicosia 2013 Vulkà Bianco  (Etna)", "Quinta do…
## $ variety               <chr> "White Blend", "Portuguese Red", "Pinot Gris", …
## $ winery                <chr> "Nicosia", "Quinta dos Avidagos", "Rainstorm", …
skim(wine)
## Skim summary statistics
##  n obs: 129971 
##  n variables: 14 
##  group variables:  
## 
## ── Variable type:character ─────────────────────────────────────────────────────
##               variable missing complete      n min max empty n_unique
##                country      63   129908 129971   2  22     0       43
##            description       0   129971 129971  20 829     0   119955
##            designation   37465    92506 129971   1  95     0    37979
##               province      63   129908 129971   3  31     0      425
##               region_1   21247   108724 129971   3  50     0     1229
##               region_2   79460    50511 129971   4  17     0       17
##            taster_name   26244   103727 129971  10  18     0       19
##  taster_twitter_handle   31213    98758 129971   6  16     0       15
##                  title       0   129971 129971  12 136     0   118840
##                variety       1   129970 129971   4  35     0      707
##                 winery       0   129971 129971   1  54     0    16757
## 
## ── Variable type:integer ───────────────────────────────────────────────────────
##  variable missing complete      n     mean       sd p0     p25   p50     p75
##    points       0   129971 129971    88.45     3.04 80    86      88    91  
##        X1       0   129971 129971 64985    37519.54  0 32492.5 64985 97477.5
##    p100     hist
##     100 ▁▃▅▇▃▂▁▁
##  129970 ▇▇▇▇▇▇▇▇
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##  variable missing complete      n  mean    sd p0 p25 p50 p75 p100     hist
##     price    8996   120975 129971 35.36 41.02  4  17  25  42 3300 ▇▁▁▁▁▁▁▁

Some Rad Markdown

Note: Using markdown is an easy way to marry analysis…

The Advantages are: 1. Simple 2. Reproducable 3. Easy for others to follow

Example analysis:

my_var <- exp(log(42))

Note2: it’s also cool bc look: the value of my_var is 42

wine %>%
    glimpse()
## Rows: 129,971
## Columns: 14
## $ X1                    <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ country               <chr> "Italy", "Portugal", "US", "US", "US", "Spain",…
## $ description           <chr> "Aromas include tropical fruit, broom, brimston…
## $ designation           <chr> "Vulkà Bianco", "Avidagos", NA, "Reserve Late H…
## $ points                <int> 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87,…
## $ price                 <dbl> NA, 15, 14, 13, 65, 15, 16, 24, 12, 27, 19, 30,…
## $ province              <chr> "Sicily & Sardinia", "Douro", "Oregon", "Michig…
## $ region_1              <chr> "Etna", NA, "Willamette Valley", "Lake Michigan…
## $ region_2              <chr> NA, NA, "Willamette Valley", NA, "Willamette Va…
## $ taster_name           <chr> "Kerin O’Keefe", "Roger Voss", "Paul Gregutt", …
## $ taster_twitter_handle <chr> "@kerinokeefe", "@vossroger", "@paulgwine ", NA…
## $ title                 <chr> "Nicosia 2013 Vulkà Bianco  (Etna)", "Quinta do…
## $ variety               <chr> "White Blend", "Portuguese Red", "Pinot Gris", …
## $ winery                <chr> "Nicosia", "Quinta dos Avidagos", "Rainstorm", …
wine %>%
    filter(variety == "Pinot Gris" & price < 10) %>%
    select(points, price, variety) %>%
    arrange(desc(points), price)
## # A tibble: 7 x 3
##   points price variety   
##    <int> <dbl> <chr>     
## 1     90     9 Pinot Gris
## 2     88     9 Pinot Gris
## 3     86     9 Pinot Gris
## 4     86     9 Pinot Gris
## 5     86     9 Pinot Gris
## 6     84     9 Pinot Gris
## 7     83     9 Pinot Gris
wine %>%
    filter(variety == "Chardonnay" & province == "Oregon") %>%
    select(points, price, variety) %>%
    skim()
## Skim summary statistics
##  n obs: 498 
##  n variables: 3 
##  group variables:  
## 
## ── Variable type:character ─────────────────────────────────────────────────────
##  variable missing complete   n min max empty n_unique
##   variety       0      498 498  10  10     0        1
## 
## ── Variable type:integer ───────────────────────────────────────────────────────
##  variable missing complete   n  mean   sd p0 p25 p50 p75 p100     hist
##    points       0      498 498 89.72 2.92 80  88  90  92   96 ▁▁▂▇▆▇▃▁
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##  variable missing complete   n  mean    sd p0 p25 p50   p75 p100     hist
##     price       0      498 498 34.94 18.92  7  22  30 44.25  125 ▅▇▃▂▁▁▁▁

Long Excercise * Suggest some Oregon wines in the region of + Chehalem Mountains

wine %>%
    filter(str_detect(region_1, "[Cc]hehalem Mountains"))
## # A tibble: 334 x 14
##       X1 country description designation points price province region_1 region_2
##    <int> <chr>   <chr>       <chr>        <int> <dbl> <chr>    <chr>    <chr>   
##  1   523 US      There's a … Steve's Re…     91    65 Oregon   Chehale… Willame…
##  2   609 US      Grüner Vel… Estate          87    20 Oregon   Chehale… Willame…
##  3   791 US      Just one b… Chehalem M…     90    85 Oregon   Chehale… Willame…
##  4   867 US      Fresh frui… Estate          90    40 Oregon   Chehale… Willame…
##  5  1092 US      From vines… Old Vines       94    50 Oregon   Chehale… Willame…
##  6  1514 US      The myster… Inchinnan       92    54 Oregon   Chehale… Willame…
##  7  2386 US      This pleas… <NA>            87    25 Oregon   Chehale… Willame…
##  8  2429 US      The Dion i… Dion            85    50 Oregon   Chehale… Willame…
##  9  2430 US      It's diffi… Dion Viney…     85    14 Oregon   Chehale… Willame…
## 10  3232 US      A blend of… Nuages          88    35 Oregon   Chehale… Willame…
## # … with 324 more rows, and 5 more variables: taster_name <chr>,
## #   taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>
wine %>%
  filter(str_detect(region_1, "[Cc]hehalem Mountains")) %>% 
  select(points, price, variety, winery) %>%
  arrange(desc(points), desc(price))
## # A tibble: 334 x 4
##    points price variety    winery     
##     <int> <dbl> <chr>      <chr>      
##  1     96    85 Pinot Noir Alloro     
##  2     95    90 Pinot Noir Adelsheim  
##  3     95    65 Pinot Noir Bergström  
##  4     95    45 Pinot Noir Alloro     
##  5     94    90 Pinot Noir Adelsheim  
##  6     94    85 Pinot Noir Alloro     
##  7     94    80 Pinot Noir Le Cadeau  
##  8     94    65 Chardonnay Lachini    
##  9     94    65 Pinot Noir Bergström  
## 10     94    65 Pinot Noir Arbor Brook
## # … with 324 more rows
Chehalem_mts <- wine %>%
  filter(str_detect(region_1, "[Cc]hehalem Mountains")) %>% 
  select(points, price, variety, winery)
Chehalem_mts %>%
  group_by(winery) %>%
  summarize('avg_points' = mean(points)) %>%
  arrange(desc(avg_points)) %>%
  top_n(5, avg_points)
## # A tibble: 5 x 2
##   winery                 avg_points
##   <chr>                       <dbl>
## 1 Domaine Divio                94  
## 2 Alloro                       93.2
## 3 Aberrant Cellars             93  
## 4 Timothy Malone               92.7
## 5 Patricia Green Cellars       92.5
Chehalem_mts %>%
  group_by(winery) %>%
  summarize('avg_points' = mean(points)) %>%
  arrange(desc(avg_points)) %>%
  top_n(5, avg_points)
## # A tibble: 5 x 2
##   winery                 avg_points
##   <chr>                       <dbl>
## 1 Domaine Divio                94  
## 2 Alloro                       93.2
## 3 Aberrant Cellars             93  
## 4 Timothy Malone               92.7
## 5 Patricia Green Cellars       92.5

now lets investigate “Domaine Divio”

Chehalem_mts %>%
    filter(winery == "Domaine Divio" | winery == "Alloro" | winery == "Aberrant Cellars") %>%
    filter(price < 50) %>%
    arrange(desc(points), desc(price), variety, winery)
## # A tibble: 11 x 4
##    points price variety    winery
##     <int> <dbl> <chr>      <chr> 
##  1     95    45 Pinot Noir Alloro
##  2     94    45 Pinot Noir Alloro
##  3     94    45 Pinot Noir Alloro
##  4     93    35 Pinot Noir Alloro
##  5     93    34 Chardonnay Alloro
##  6     93    34 Chardonnay Alloro
##  7     93    28 Chardonnay Alloro
##  8     92    45 Pinot Noir Alloro
##  9     92    35 Pinot Noir Alloro
## 10     92    28 Chardonnay Alloro
## 11     91    35 Pinot Noir Alloro
Chehalem_mts %>%
    filter(winery == "Domaine Divio" | winery == "Alloro" | winery == "Aberrant Cellars") %>%
    filter(price < 50) %>%
    count(str_detect(variety, "[Pp]inot Noir"))
## # A tibble: 2 x 2
##   `str_detect(variety, "[Pp]inot Noir")`     n
## * <lgl>                                  <int>
## 1 FALSE                                      4
## 2 TRUE                                       7
Chehalem_mts %>%
    filter(winery == "Timothy Malone" | winery == "Aberrant Cellars" | winery == "Patricia Green Cellars") %>%
arrange(desc(winery), desc(points), price, variety)
## # A tibble: 7 x 4
##   points price variety    winery                
##    <int> <dbl> <chr>      <chr>                 
## 1     94    20 Riesling   Timothy Malone        
## 2     93    20 Riesling   Timothy Malone        
## 3     91    37 Pinot Noir Timothy Malone        
## 4     93    48 Pinot Noir Patricia Green Cellars
## 5     92    48 Pinot Noir Patricia Green Cellars
## 6     93    50 Pinot Noir Aberrant Cellars      
## 7     93    50 Pinot Noir Aberrant Cellars
Chehalem_mts %>%
    filter(winery == "Timothy Malone" | winery == "Aberrant Cellars" | winery == "Patricia Green      Cellars") %>%
    filter(price <= 50 & points >= 90) %>%
    ggplot(aes(x = winery, fill = variety)) + geom_bar()

ggplot(aes(character)) + geom_bar()

Chehalem_mts %>%
    filter(winery == "Alloro" | winery == "Aberrant Cellars" | winery == "Patricia Green      Cellars") %>%
    filter(price <= 50 & points >= 90) %>%
    ggplot(aes(x = price, y = points, color = variety)) + geom_point()

Chehalem_mts %>%
    filter(variety == "Pinot Noir" | variety == "Chardonnay" | variety == "Riesling") %>%
    filter(price <= 50, points >= 92 & points <= 95) %>%
    ggplot(aes(x = price, y = points, color = variety)) + geom_point()

wine %>%
  filter(country == "France" | country == "Italy") %>%
  top_n(10, price) %>%
  select(points, price, country, title) %>%
  arrange(desc(points))
## # A tibble: 10 x 4
##    points price country title                                         
##     <int> <dbl> <chr>   <chr>                                         
##  1    100  1500 France  Château Lafite Rothschild 2010  Pauillac      
##  2    100  1500 France  Château Cheval Blanc 2010  Saint-Émilion      
##  3     98  1900 France  Château Margaux 2009  Margaux                 
##  4     97  2000 France  Château Pétrus 2011  Pomerol                  
##  5     96  1200 France  Château Haut-Brion 2009  Pessac-Léognan       
##  6     96  1300 France  Château Mouton Rothschild 2009  Pauillac      
##  7     96  2500 France  Château Pétrus 2014  Pomerol                  
##  8     96  2500 France  Domaine du Comte Liger-Belair 2010  La Romanée
##  9     96  2000 France  Domaine du Comte Liger-Belair 2005  La Romanée
## 10     88  3300 France  Château les Ormes Sorbet 2013  Médoc
wine %>%
  filter(province == "Oregon" & variety != "Chardonnay") %>%
  top_n(5, points) %>%
  select(points, price, title) %>%
  arrange(price)
## # A tibble: 7 x 3
##   points price title                                                            
##    <int> <dbl> <chr>                                                            
## 1     97    65 Ken Wright 2012 Abbott Claim Vineyard Pinot Noir                 
## 2     99    75 Cayuse 2009 En Chamberlin Vineyard Syrah (Walla Walla Valley (OR…
## 3     99    75 Cayuse 2011 En Chamberlin Vineyard Syrah (Walla Walla Valley (OR…
## 4     98    75 Cayuse 2011 En Cerise Vineyard Syrah (Walla Walla Valley (OR))   
## 5     97    80 Cayuse 2009 The Widowmaker Cabernet Sauvignon (Walla Walla Valle…
## 6     97    85 Cayuse 2009 Armada Vineyard Syrah (Walla Walla Valley (OR))      
## 7     97    90 Cayuse 2011 Widowmaker En Chamberlin Vineyard Cabernet Sauvignon…
wine %>%
    filter(province == "Oregon") %>%
    filter(variety != "Pinot Noir" & variety != "Chardonnay") %>%
    top_frac(.05, points) %>%
    select(points, price, title) %>%
    arrange(desc(points), price)
## # A tibble: 131 x 3
##    points price title                                                           
##     <int> <dbl> <chr>                                                           
##  1     99    75 Cayuse 2009 En Chamberlin Vineyard Syrah (Walla Walla Valley (O…
##  2     99    75 Cayuse 2011 En Chamberlin Vineyard Syrah (Walla Walla Valley (O…
##  3     98    75 Cayuse 2011 En Cerise Vineyard Syrah (Walla Walla Valley (OR))  
##  4     97    80 Cayuse 2009 The Widowmaker Cabernet Sauvignon (Walla Walla Vall…
##  5     97    85 Cayuse 2009 Armada Vineyard Syrah (Walla Walla Valley (OR))     
##  6     97    90 Cayuse 2011 Widowmaker En Chamberlin Vineyard Cabernet Sauvigno…
##  7     96    32 Trisaetum 2016 Ribbon Ridge Estate Dry Riesling (Ribbon Ridge)  
##  8     96    38 Trisaetum 2015 Estates Reserve Riesling (Willamette Valley)     
##  9     96    70 Cayuse 2012 Cailloux Vineyard Viognier (Walla Walla Valley (OR))
## 10     96    75 Cayuse 2009 Camaspelo Cabernet Sauvignon-Merlot (Walla Walla Va…
## # … with 121 more rows

Pivot_wider() and Pivot_longer()

wine_country <- wine %>%
    filter(variety == "Cabernet Sauvignon" | variety == "Chardonnay" | variety == "Pinot Gris" | variety == "Syrah") %>%
  group_by(country, variety) %>%
  summarize(points = mean(points)) %>%
  arrange(country)
wine_country
## # A tibble: 90 x 3
## # Groups:   country [32]
##    country   variety            points
##    <chr>     <chr>               <dbl>
##  1 Argentina Cabernet Sauvignon   86.0
##  2 Argentina Chardonnay           84.9
##  3 Argentina Pinot Gris           84.9
##  4 Argentina Syrah                85.8
##  5 Australia Cabernet Sauvignon   89.3
##  6 Australia Chardonnay           87.3
##  7 Australia Pinot Gris           87.4
##  8 Australia Syrah                91.6
##  9 Austria   Cabernet Sauvignon   87.4
## 10 Austria   Chardonnay           90.3
## # … with 80 more rows

now lets pivot_wider it out so that we’ve got one column for each variety of wine

wide_wine <- wine_country %>%
  pivot_wider(variety, points)
wide_wine
## # A tibble: 4 x 1
##   variety           
##   <chr>             
## 1 Cabernet Sauvignon
## 2 Chardonnay        
## 3 Pinot Gris        
## 4 Syrah

now if we want to pivot_longer it back to how it was

wine_long <- wide_wine %>%
  pivot_longer("variety", "points", 2:5)

wine_long
## # A tibble: 4 x 2
##   points  value             
##   <chr>   <chr>             
## 1 variety Cabernet Sauvignon
## 2 variety Chardonnay        
## 3 variety Pinot Gris        
## 4 variety Syrah

filter and count to figure out which country has more chardonnay, us or france?

wine %>%
  filter(country == "US" | country == "France") %>%
  filter(variety == "Chardonnay") %>%
  count(country)
## # A tibble: 2 x 2
##   country     n
## * <chr>   <int>
## 1 France   2808
## 2 US       6801
wine %>%
  filter(country == "US" | country == "France") %>%
  filter(variety == "Chardonnay") %>%
  ggplot(aes(x = country)) + geom_bar() 

Stacked bar chart

wine %>%
    filter(province == "Washington" | province == "Oregon" | province == "California") %>%
    filter(variety == "Cabernet Sauvignon" | variety == "Syrah" | variety == "Pinot Noir") %>%
    ggplot(aes(x = province, fill = variety)) + geom_bar()

wine %>%
    filter(province == "Washington" | province == "Oregon" | province == "California") %>%
    filter(points >= 97) %>%
    ggplot(aes(x = province, fill = variety)) + geom_bar() + scale_fill_manual(values=c("#CC0000", "#006600", "#669999", "#00CCCC", 
                             "#660099", "#CC0066", "#FF9999", "#FF9900", 
                             "blue", "yellow", "blue2", "grey", "grey2"))

Can use the “Summarize function” for calculations like mean, median, variance, min/max

wine %>%
    summarize(avg_points = mean(points))
## # A tibble: 1 x 1
##   avg_points
##        <dbl>
## 1       88.4
wine %>%
  ggplot(aes(x = points)) + geom_histogram()

wine %>%
    ggplot(aes(x = points, y = log(price))) +geom_point()

Combining discrete and continuous data

  • lets say we want to summarize by category
wine %>%
    filter(country == "US") %>%
    filter(!is.na(price)) %>%
    group_by(province) %>%
    summarize(
      count = n(), 
    average_points = mean(points), 
    average_price = mean(price)) %>%
    filter(count > 100) %>%
    arrange(desc(average_points))
## # A tibble: 7 x 4
##   province   count average_points average_price
##   <chr>      <int>          <dbl>         <dbl>
## 1 Oregon      5359           89.1          36.5
## 2 Washington  8583           89.0          32.4
## 3 California 36104           88.6          39.0
## 4 New York    2676           87.2          22.8
## 5 Idaho        190           86.6          20.8
## 6 Michigan     111           86.2          32.4
## 7 Virginia     770           85.6          27.0

US wines

wine %>%
    filter(country == "US") %>%
    filter(!is.na(price)) %>%
    group_by(province, variety) %>%
    summarize(count = n(),
              max_price = max(price)) %>%
    filter(count>100) %>%
    arrange(desc(count))
## # A tibble: 64 x 4
## # Groups:   province [5]
##    province   variety            count max_price
##    <chr>      <chr>              <int>     <dbl>
##  1 California Pinot Noir          6875       155
##  2 California Cabernet Sauvignon  5668       625
##  3 California Chardonnay          5157      2013
##  4 Oregon     Pinot Noir          2779       275
##  5 California Zinfandel           2633       100
##  6 California Syrah               1862       750
##  7 California Sauvignon Blanc     1801        75
##  8 California Red Blend           1791       290
##  9 California Merlot              1390       200
## 10 Washington Cabernet Sauvignon  1356       160
## # … with 54 more rows

density funciton

wine %>%
    filter(province == "Washington" | province == "Oregon" | province == "California") %>%
    ggplot(aes(x=log(price), fill = province)) + geom_density(alpha = 0.4)

wine %>%
    filter(province == "Washington" | province == "Oregon" | province == "California") %>%
    ggplot(aes(x=log(price), fill = province)) + geom_density(alpha = 0.1)

Grouping by province shows the same data as…

wine %>% 
  group_by(province)
## # A tibble: 129,971 x 14
## # Groups:   province [426]
##       X1 country description designation points price province region_1 region_2
##    <int> <chr>   <chr>       <chr>        <int> <dbl> <chr>    <chr>    <chr>   
##  1     0 Italy   Aromas inc… Vulkà Bian…     87    NA Sicily … Etna     <NA>    
##  2     1 Portug… This is ri… Avidagos        87    15 Douro    <NA>     <NA>    
##  3     2 US      Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  4     3 US      Pineapple … Reserve La…     87    13 Michigan Lake Mi… <NA>    
##  5     4 US      Much like … Vintner's …     87    65 Oregon   Willame… Willame…
##  6     5 Spain   Blackberry… Ars In Vit…     87    15 Norther… Navarra  <NA>    
##  7     6 Italy   Here's a b… Belsito         87    16 Sicily … Vittoria <NA>    
##  8     7 France  This dry a… <NA>            87    24 Alsace   Alsace   <NA>    
##  9     8 Germany Savory dri… Shine           87    12 Rheinhe… <NA>     <NA>    
## 10     9 France  This has g… Les Natures     87    27 Alsace   Alsace   <NA>    
## # … with 129,961 more rows, and 5 more variables: taster_name <chr>,
## #   taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>
wine %>% 
  group_by(province, variety)
## # A tibble: 129,971 x 14
## # Groups:   province, variety [4,151]
##       X1 country description designation points price province region_1 region_2
##    <int> <chr>   <chr>       <chr>        <int> <dbl> <chr>    <chr>    <chr>   
##  1     0 Italy   Aromas inc… Vulkà Bian…     87    NA Sicily … Etna     <NA>    
##  2     1 Portug… This is ri… Avidagos        87    15 Douro    <NA>     <NA>    
##  3     2 US      Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  4     3 US      Pineapple … Reserve La…     87    13 Michigan Lake Mi… <NA>    
##  5     4 US      Much like … Vintner's …     87    65 Oregon   Willame… Willame…
##  6     5 Spain   Blackberry… Ars In Vit…     87    15 Norther… Navarra  <NA>    
##  7     6 Italy   Here's a b… Belsito         87    16 Sicily … Vittoria <NA>    
##  8     7 France  This dry a… <NA>            87    24 Alsace   Alsace   <NA>    
##  9     8 Germany Savory dri… Shine           87    12 Rheinhe… <NA>     <NA>    
## 10     9 France  This has g… Les Natures     87    27 Alsace   Alsace   <NA>    
## # … with 129,961 more rows, and 5 more variables: taster_name <chr>,
## #   taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>
wine %>% 
  group_by(country) %>% 
  summarize('avg. price'=mean(price)) %>% 
  head(5)
## # A tibble: 5 x 2
##   country                `avg. price`
##   <chr>                         <dbl>
## 1 Argentina                      NA  
## 2 Armenia                        14.5
## 3 Australia                      NA  
## 4 Austria                        NA  
## 5 Bosnia and Herzegovina         12.5

What are the top five varieties in Argentinia by points?

wine %>% 
  filter(country=="Argentina") %>% 
  group_by(variety) %>% 
  summarize(
    'avg_points' = mean(points)) %>% 
  top_n(5,avg_points) %>% 
  arrange(desc(avg_points))
## # A tibble: 5 x 2
##   variety               avg_points
##   <chr>                      <dbl>
## 1 Malbec-Cabernet             91.7
## 2 Cabernet Franc-Malbec       91  
## 3 Malbec-Petit Verdot         91  
## 4 Tempranillo-Malbec          91  
## 5 Syrah-Viognier              90.7

Advanced summarize()

Each call to summarize removes one level of grouping.

Note that by running summarize twice, I am back to the results from the previous slide.

wine %>% 
  filter(country=="Argentina") %>% 
  group_by(variety, winery) %>% 
  summarize(points = mean(points)) %>% 
  summarize('avg_points' = mean(points)) %>% 
  top_n(5,avg_points) %>% 
  arrange(desc(avg_points))
## # A tibble: 5 x 2
##   variety               avg_points
##   <chr>                      <dbl>
## 1 Malbec-Cabernet             91.7
## 2 Cabernet Franc-Malbec       91  
## 3 Malbec-Petit Verdot         91  
## 4 Tempranillo-Malbec          91  
## 5 Syrah-Viognier              90.7

Exercise

  1. If needed, open the help file on dplyr::summarize()
  2. use summarize() with n_distinct()
  3. to find the number of unique wineries in Argentina

Solution

wine %>% 
  filter(country=="Argentina") %>% 
  summarize(winery_count=n_distinct(winery))
## # A tibble: 1 x 1
##   winery_count
##          <int>
## 1          531
wine %>% 
  filter(country=="Argentina") %>% 
  count(winery)
## # A tibble: 531 x 2
##    winery               n
##  * <chr>            <int>
##  1 2 Copas              1
##  2 25 Lagunas           1
##  3 Achaval-Ferrer      18
##  4 Aconcagua            3
##  5 Aconga               5
##  6 Acordeón             6
##  7 Adoquin              2
##  8 Aguijón De Abeja     4
##  9 Aitor Ider Balbo     1
## 10 Alamos              24
## # … with 521 more rows

Definitions

Primary keys: uniquely identifies row in its own dataframe

Foreign keys: uniquely identifies row in another dataframe

Let’s pivot longer, then start with the most recent year

pop <- read_csv("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/population.csv") %>% 
  pivot_longer(., names_to ="year", values_to = "population", 3:61) %>% 
  rename("country"="Country Name") 

  
pop2017 <- pop %>% 
  filter(year==2017) %>% 
  select(country, population)
pop2017
## # A tibble: 263 x 2
##    country              population
##    <chr>                     <dbl>
##  1 Aruba                    105366
##  2 Afghanistan            36296400
##  3 Angola                 29816748
##  4 Albania                 2873457
##  5 Andorra                   77001
##  6 Arab World            411898965
##  7 United Arab Emirates    9487203
##  8 Argentina              44044811
##  9 Armenia                 2944809
## 10 American Samoa            55620
## # … with 253 more rows

Try joining with wine on country

  • Inner_join
wine_pop <- wine %>% 
  inner_join(pop2017) %>% 
  select(country, population, title)

wine_pop
## # A tibble: 75,316 x 3
##    country  population title                                                  
##    <chr>         <dbl> <chr>                                                  
##  1 Italy      60536709 Nicosia 2013 Vulkà Bianco  (Etna)                      
##  2 Portugal   10300300 Quinta dos Avidagos 2011 Avidagos Red (Douro)          
##  3 Spain      46593236 Tandem 2011 Ars In Vitro Tempranillo-Merlot (Navarra)  
##  4 Italy      60536709 Terre di Giurfo 2013 Belsito Frappato (Vittoria)       
##  5 France     66865144 Trimbach 2012 Gewurztraminer (Alsace)                  
##  6 Germany    82657002 Heinz Eifel 2013 Shine Gewürztraminer (Rheinhessen)    
##  7 France     66865144 Jean-Baptiste Adam 2012 Les Natures Pinot Gris (Alsace)
##  8 France     66865144 Leon Beyer 2012 Gewurztraminer (Alsace)                
##  9 Italy      60536709 Masseria Setteporte 2012 Rosso  (Etna)                 
## 10 Germany    82657002 Richard Böcking 2013 Devon Riesling (Mosel)            
## # … with 75,306 more rows
  • left_join
wine_pop <- wine %>% 
  left_join(pop2017) %>% 
  select(country, population, title)
wine_pop
## # A tibble: 129,971 x 3
##    country  population title                                                    
##    <chr>         <dbl> <chr>                                                    
##  1 Italy      60536709 Nicosia 2013 Vulkà Bianco  (Etna)                        
##  2 Portugal   10300300 Quinta dos Avidagos 2011 Avidagos Red (Douro)            
##  3 US               NA Rainstorm 2013 Pinot Gris (Willamette Valley)            
##  4 US               NA St. Julian 2013 Reserve Late Harvest Riesling (Lake Mich…
##  5 US               NA Sweet Cheeks 2012 Vintner's Reserve Wild Child Block Pin…
##  6 Spain      46593236 Tandem 2011 Ars In Vitro Tempranillo-Merlot (Navarra)    
##  7 Italy      60536709 Terre di Giurfo 2013 Belsito Frappato (Vittoria)         
##  8 France     66865144 Trimbach 2012 Gewurztraminer (Alsace)                    
##  9 Germany    82657002 Heinz Eifel 2013 Shine Gewürztraminer (Rheinhessen)      
## 10 France     66865144 Jean-Baptiste Adam 2012 Les Natures Pinot Gris (Alsace)  
## # … with 129,961 more rows

Types of joins

  • Mutating
    • inner_join()
    • left_join()
    • right_join()
    • full_join()
  • Filtering
    • semi_join()
    • anti_join()

Anti_join

wine %>% 
  anti_join(pop2017) 
## # A tibble: 54,655 x 14
##       X1 country description designation points price province region_1 region_2
##    <int> <chr>   <chr>       <chr>        <int> <dbl> <chr>    <chr>    <chr>   
##  1     2 US      Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  2     3 US      Pineapple … Reserve La…     87    13 Michigan Lake Mi… <NA>    
##  3     4 US      Much like … Vintner's …     87    65 Oregon   Willame… Willame…
##  4    10 US      Soft, supp… Mountain C…     87    19 Califor… Napa Va… Napa    
##  5    12 US      Slightly r… <NA>            87    34 Califor… Alexand… Sonoma  
##  6    14 US      Building o… <NA>            87    12 Califor… Central… Central…
##  7    19 US      Red fruit … <NA>            87    32 Virginia Virginia <NA>    
##  8    20 US      Ripe aroma… Vin de Mai…     87    23 Virginia Virginia <NA>    
##  9    21 US      A sleek mi… <NA>            87    20 Oregon   Oregon   Oregon …
## 10    23 US      This wine … Signature …     87    22 Califor… Paso Ro… Central…
## # … with 54,645 more rows, and 5 more variables: taster_name <chr>,
## #   taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>

Mutation for new variables

The function mutate() from dplyr allows you to create new variables from existing ones.

wine %>% 
  mutate(lprice=log(price)) %>% 
  select(price, lprice)
## # A tibble: 129,971 x 2
##    price lprice
##    <dbl>  <dbl>
##  1    NA  NA   
##  2    15   2.71
##  3    14   2.64
##  4    13   2.56
##  5    65   4.17
##  6    15   2.71
##  7    16   2.77
##  8    24   3.18
##  9    12   2.48
## 10    27   3.30
## # … with 129,961 more rows

You can also mutate by group. For instance, let’s say you wanted to standardize prices by variety

wine %>% 
  group_by(variety) %>% 
  mutate(npoints = (points-mean(points)) / sd(points)) %>%
  filter(variety == "Riesling" | variety == "Cabernet Sauvignon") %>% 
  ggplot(aes(npoints, fill=variety))+
    geom_density(alpha=0.3)

Conditional mutation

Let’s change that pesky ‘US’ identifier to ‘United States’

wine <- wine %>%
  mutate(country=ifelse(country=="US","United States",country))
wine
## # A tibble: 129,971 x 14
##       X1 country description designation points price province region_1 region_2
##    <int> <chr>   <chr>       <chr>        <int> <dbl> <chr>    <chr>    <chr>   
##  1     0 Italy   Aromas inc… Vulkà Bian…     87    NA Sicily … Etna     <NA>    
##  2     1 Portug… This is ri… Avidagos        87    15 Douro    <NA>     <NA>    
##  3     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  4     3 United… Pineapple … Reserve La…     87    13 Michigan Lake Mi… <NA>    
##  5     4 United… Much like … Vintner's …     87    65 Oregon   Willame… Willame…
##  6     5 Spain   Blackberry… Ars In Vit…     87    15 Norther… Navarra  <NA>    
##  7     6 Italy   Here's a b… Belsito         87    16 Sicily … Vittoria <NA>    
##  8     7 France  This dry a… <NA>            87    24 Alsace   Alsace   <NA>    
##  9     8 Germany Savory dri… Shine           87    12 Rheinhe… <NA>     <NA>    
## 10     9 France  This has g… Les Natures     87    27 Alsace   Alsace   <NA>    
## # … with 129,961 more rows, and 5 more variables: taster_name <chr>,
## #   taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>

Strings

String basics

library(stringr)
name <- "Jameson Watts"
name
## [1] "Jameson Watts"
quote <- '"As soon as you stop wanting something, you get it." - Andy Warhol'
quote
## [1] "\"As soon as you stop wanting something, you get it.\" - Andy Warhol"
writeLines(quote)
## "As soon as you stop wanting something, you get it." - Andy Warhol

String Operations

str_length(quote)
## [1] 66
name_quote <- str_c("Name: ", name, "\nQuote: ", quote)
writeLines(name_quote)
## Name: Jameson Watts
## Quote: "As soon as you stop wanting something, you get it." - Andy Warhol

Other common string functions

str_sub(name_quote,10,30)
## [1] "eson Watts\nQuote: \"As"
str_to_lower(name_quote)
## [1] "name: jameson watts\nquote: \"as soon as you stop wanting something, you get it.\" - andy warhol"
str_locate(name_quote,"Jameson")
##      start end
## [1,]     7  13

Regular expressions

library(htmlwidgets)
str_detect(name_quote, "on Wa")
## [1] TRUE
str_detect(name_quote, "on wa")
## [1] FALSE
str_view_all(name_quote, "ou")

More complicated Regex

Placeholders and Repetition

  • . maches any character
  • * matches 0 or more
  • + matches 1 or more
  • ? matches 0 or 1

Anchors

  • ^ matches start of string
  • $ matches end of string

Character classes

  • \d matches any digit.
  • \s matches any whitespace (e.g. space, tab, newline).
  • [abc] matches a, b, or c.
  • [^abc] matches anything except a, b, or c.

Some examples

str_view_all(name_quote, "a.+s")
wine %>% 
  filter(str_detect(variety,"[Cc]abernet")) %>% 
  select(points, price, variety) %>% 
  arrange(desc(price))
## # A tibble: 11,746 x 3
##    points price variety                       
##     <int> <dbl> <chr>                         
##  1     94   625 Cabernet Sauvignon            
##  2     98   625 Cabernet Sauvignon            
##  3     97   625 Cabernet Sauvignon            
##  4     92   600 Tempranillo-Cabernet Sauvignon
##  5     93   500 Cabernet Sauvignon            
##  6     89   500 Tempranillo-Cabernet Sauvignon
##  7     95   500 Cabernet Sauvignon            
##  8     89   500 Tempranillo-Cabernet Sauvignon
##  9     92   400 Cabernet Sauvignon            
## 10     96   400 Cabernet Sauvignon            
## # … with 11,736 more rows

Exercise

  1. Use filter() and str_detect() to
  2. find all Oregon wines
  3. with the words ‘espresso’ and ‘black currant’ in their description
  4. showing variety, price, points and winery

Note: these sorts of queries can tax your computer

Solution

wine %>% 
  filter(province=="Oregon") %>% 
  filter(str_detect(description,"[Ee]spresso") & str_detect(description,"[Bb]lack currant")) %>% 
  select(variety, price, points, winery) %>% 
  arrange(desc(points), price)
## # A tibble: 1 x 4
##   variety    price points winery
##   <chr>      <dbl>  <int> <chr> 
## 1 Pinot Noir    66     94 Shea

Bringing it all together

Combining with mutate

wine %>% 
  filter(province=="Oregon") %>% 
  mutate(fruity = str_detect(description,"[Ff]ruity")) %>% 
  ggplot(aes(points, fill=fruity))+
    geom_density(alpha=.5)

Extracting year from title

wine <- wine %>% 
  mutate(year = str_extract(title,"(\\d{4})"))
wine %>% 
  select(title, year)
## # A tibble: 129,971 x 2
##    title                                                                   year 
##    <chr>                                                                   <chr>
##  1 Nicosia 2013 Vulkà Bianco  (Etna)                                       2013 
##  2 Quinta dos Avidagos 2011 Avidagos Red (Douro)                           2011 
##  3 Rainstorm 2013 Pinot Gris (Willamette Valley)                           2013 
##  4 St. Julian 2013 Reserve Late Harvest Riesling (Lake Michigan Shore)     2013 
##  5 Sweet Cheeks 2012 Vintner's Reserve Wild Child Block Pinot Noir (Willa… 2012 
##  6 Tandem 2011 Ars In Vitro Tempranillo-Merlot (Navarra)                   2011 
##  7 Terre di Giurfo 2013 Belsito Frappato (Vittoria)                        2013 
##  8 Trimbach 2012 Gewurztraminer (Alsace)                                   2012 
##  9 Heinz Eifel 2013 Shine Gewürztraminer (Rheinhessen)                     2013 
## 10 Jean-Baptiste Adam 2012 Les Natures Pinot Gris (Alsace)                 2012 
## # … with 129,961 more rows

Graphing points by year

wine %>% 
  filter(province=="Oregon") %>%
  filter(variety=="Pinot Noir" | variety=="Chardonnay" | variety=="Pinot Gris") %>%
  filter(year >= 1995) %>% 
  group_by(year, variety) %>% 
  summarize(avg_points = mean(points)) %>% 
  ggplot(aes(x=as.integer(year), y=avg_points, color=variety)) +
    geom_line()

Joining on more than one column

Now that we’ve extracted year from the title, we can do a join on both country and year

wine %>% 
  left_join(pop) %>% 
  filter(!is.na(population) & !is.na(year)) %>%
  filter(population < 1000000000) %>% 
  group_by(country,year) %>% 
  summarize(population=mean(population),price = mean(price)) %>% 
  ggplot(aes(x=log(population), y=log(price))) +
    geom_jitter(alpha=.1)+
    geom_smooth(method = lm)

Long Exercise

  1. Go to https://data.worldbank.org/indicator
  2. Find some cool country data
  3. Merge it with the wine data
  4. Decide on research question
  5. Draw a cool graph that answers the question

Bonus: joining with yourself

top_wineries <- wine %>% 
  group_by(winery) %>% 
  summarize(
    avg_points=mean(points),
    count=n()) %>% 
  filter(count > 10) %>% 
  top_frac(.05,avg_points) %>% 
  left_join(wine)
top_wineries
## # A tibble: 4,374 x 17
##    winery avg_points count    X1 country description designation points price
##    <chr>       <dbl> <int> <int> <chr>   <chr>       <chr>        <int> <dbl>
##  1 Abeja        92.4    28  6061 United… All variet… Heather Hi…     89    55
##  2 Abeja        92.4    28  6737 United… Fresh and … <NA>            94    36
##  3 Abeja        92.4    28 28201 United… The wine i… <NA>            91    36
##  4 Abeja        92.4    28 33841 United… Sourced fr… Reserve         97    85
##  5 Abeja        92.4    28 33847 United… This 100% … <NA>            95    38
##  6 Abeja        92.4    28 40428 United… Made from … <NA>            92    40
##  7 Abeja        92.4    28 48216 United… This new v… <NA>            93    28
##  8 Abeja        92.4    28 54754 United… Abeja wine… <NA>            94    38
##  9 Abeja        92.4    28 54763 United… Abeja's Ch… <NA>            94    36
## 10 Abeja        92.4    28 54891 United… Walla Wall… Estate Gro…     90    30
## # … with 4,364 more rows, and 8 more variables: province <chr>, region_1 <chr>,
## #   region_2 <chr>, taster_name <chr>, taster_twitter_handle <chr>,
## #   title <chr>, variety <chr>, year <chr>
rainfall
## # A tibble: 48 x 13
##     Year   Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov   Dec
##    <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  1970 13.5   4.46  1.92  2.63  1.36  0.85  0.01 NA     1.81  3.25  7.18  9.74
##  2  1971  6.49  4.34  6.93  4.05  1.89  2.47  0.01  1.49  3.98  3.09  6.27  8.18
##  3  1972  7.98  4.68  4.96  3.79  2.4   0.69  0.12  0.14  2.07  0.7   3.77  8.7 
##  4  1973  5.64  1.62  3.5   1.69  1.11  1.48 NA     0.8   2.8   2.79 15.2  11.1 
##  5  1974 10.9   5.56  7.95  1.48  0.9   0.41  1.8   0.11  0.28  2.15  7.42  6.94
##  6  1975  4.96  4.68  4.22  2.2   1.66  0.81  0.51  1.96  0     5.51  6.06  6.07
##  7  1976  5.47  6.92  3.66  2     1.33  1.04  0.67  1.89  1.13  1.51  1.13  1.26
##  8  1977  0.88  2.83  3.33  0.62  3.76  0.73  0.26  1.7   2.36  2.37  6.19  8.73
##  9  1978  5.67  3.54  1.23  3.5   2.97  0.48  1.07  2.56  2.64  0.37  4.5   2.64
## 10  1979  2.84  7.19  2.17  2.82  2.2   0.65  0.3   0.7   2.19  6.06  3.83  6.95
## # … with 38 more rows
rainfall %>%
    rename("year" = "Year") %>%
    pivot_longer(-year, names_to = "month", values_to = "rainfall")
## # A tibble: 576 x 3
##     year month rainfall
##    <int> <chr>    <dbl>
##  1  1970 Jan      13.5 
##  2  1970 Feb       4.46
##  3  1970 Mar       1.92
##  4  1970 Apr       2.63
##  5  1970 May       1.36
##  6  1970 Jun       0.85
##  7  1970 Jul       0.01
##  8  1970 Aug      NA   
##  9  1970 Sep       1.81
## 10  1970 Oct       3.25
## # … with 566 more rows

load wine data

get rid of NA prices

only oregon wines

extract year from title, as numeric

join with rainfall

pivot longer

new_wine <-wine %>%
  filter(!is.na(price)) %>%
  filter(province == "Oregon") %>%
  mutate(year = as.numeric(str_extract(title,"(\\d{4})"))) %>%
  left_join(rainfall, by = c("year" = "Year")) %>%
  pivot_longer(16:26, names_to = "month", values_to = "rainfall")
new_wine %>%
    select(year, month, rainfall, title)
## # A tibble: 58,949 x 4
##     year month rainfall title                                        
##    <dbl> <chr>    <dbl> <chr>                                        
##  1  2013 Jan       1.63 Rainstorm 2013 Pinot Gris (Willamette Valley)
##  2  2013 Feb       1.42 Rainstorm 2013 Pinot Gris (Willamette Valley)
##  3  2013 Mar       2.21 Rainstorm 2013 Pinot Gris (Willamette Valley)
##  4  2013 Apr       2.39 Rainstorm 2013 Pinot Gris (Willamette Valley)
##  5  2013 May       2.94 Rainstorm 2013 Pinot Gris (Willamette Valley)
##  6  2013 Jun       1.02 Rainstorm 2013 Pinot Gris (Willamette Valley)
##  7  2013 Jul       0    Rainstorm 2013 Pinot Gris (Willamette Valley)
##  8  2013 Aug       0.35 Rainstorm 2013 Pinot Gris (Willamette Valley)
##  9  2013 Sep       7.05 Rainstorm 2013 Pinot Gris (Willamette Valley)
## 10  2013 Oct       0.63 Rainstorm 2013 Pinot Gris (Willamette Valley)
## # … with 58,939 more rows

Better casewhen

lets say you want to do a bunch of if_else statements

new_wine %>%
    mutate(month_number =
             case_when(month == "Jan" ~ 1,
                      month == "Feb" ~ 2,
                      month == "Mar" ~ 3,
                      month == "Apr" ~ 4,
                      month == "May" ~ 5,
                      month == "Jun" ~ 6,
                      month == "Jul" ~ 7,
                      month == "Aug" ~ 8,
                      month == "Sep" ~ 9,
                      month == "Oct" ~ 10,
                      month == "Nov" ~ 11,
                      month == "Dec" ~ 12,
                       ))
## # A tibble: 58,949 x 19
##       X1 country description designation points price province region_1 region_2
##    <int> <chr>   <chr>       <chr>        <int> <dbl> <chr>    <chr>    <chr>   
##  1     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  2     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  3     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  4     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  5     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  6     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  7     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  8     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  9     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
## 10     2 United… Tart and s… <NA>            87    14 Oregon   Willame… Willame…
## # … with 58,939 more rows, and 10 more variables: taster_name <chr>,
## #   taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>,
## #   year <dbl>, Dec <dbl>, month <chr>, rainfall <dbl>, month_number <dbl>

use case_when and/or str_detect

create “character” w/ values tart/spicy oregon wines w/ those terms in description density plot

new_wine <- new_wine %>%
  pivot_wider(names_from = month, values_from = rainfall) %>%
  mutate(character = case_when(
                     str_detect(description, "[Tt]art") ~ 'tart',
                     str_detect(description, "[Ss]icy") ~ 'spicy'))
new_wine %>%
    filter(!is.na(character)) %>%
    ggplot(aes(log(price), fill=character)) + geom_density(alpha =.7)

new_wine %>%
    filter(year > 1995) %>%
    filter(!is.na(character)) %>%
    ggplot(aes(x=year, y = points, color = character)) + geom_jitter()

## Scatter plot example

new_wine %>% 
  filter(year>1995) %>% 
  filter(!is.na(character)) %>% 
  ggplot(aes(x=year, y=points, color=character)) +
    geom_jitter()

Bar plot example

new_wine %>% 
  filter(!is.na(character)) %>% 
  ggplot(aes(character)) +
    geom_bar()

Exercise

Plot the counts of each character of wine between 1995 and 2015

Solution

new_wine %>% 
  filter(year>1995 & year <= 2015) %>%
  filter(!is.na(character)) %>% 
  group_by(year,character) %>% 
  summarise(char_cnt=n()) %>% 
  ggplot(aes(year,char_cnt, color=character))+
    geom_line()

Histogram example

new_wine %>% 
  ggplot(aes(price))+
    geom_histogram()

Violin Plots

new_wine %>% 
  filter(!is.na(character)) %>% 
  ggplot(aes(character,log(price), color=character))+
    geom_violin()

Long Exercise

Use any of the techniques that you’ve learned thus far to answer the following:

Is there a relationship between rainfall and wine quality in Oregon?

One simple solution (and some bonus code)

rains <- rainfall %>%
  rename("year"="Year") %>% 
  pivot_longer(-year,names_to = 'month',values_to = 'rainfall') %>% 
  mutate(rainfall=ifelse(is.na(rainfall),0,rainfall)) %>%
  filter(month %in% c('May','Jun','Jul','Aug','Sep')) %>% #note the %in% operator
  group_by(year) %>% 
  summarise(summer_rain=sum(rainfall))
new_wine <- new_wine %>% 
  filter(points > 88) %>% 
  group_by(year) %>% 
  summarize(avg_price=mean(price), avg_points=mean(points)) %>% 
  left_join(rains)

And a graph

new_wine %>% 
  ggplot(aes(log(summer_rain),avg_points))+
    geom_point()+
    geom_smooth(method = lm)