library(readr)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ ggplot2 3.3.3     ✓ forcats 0.3.0

## Warning: package 'stringr' was built under R version 3.5.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(skimr)

# Disabled alternative: read the data from a CSV file instead
# (presumably the same dataset; verify before re-enabling).
#winemag_data <- read_csv("Downloads/winemag-data.csv")

# Load the wine data from an RDS file into `winemag_data`.
# NOTE(review): this is an absolute, user-specific path, so the script will
# only run where that exact file exists; consider a project-relative path.
winemag_data <- read_rds("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/wine.rds")

Filtering Pinot From Oregon

# Oregon Pinot Noir rows: highest points first, cheapest first within a score;
# only points, price and title are kept.
winemag_data %>%
  filter(province == "Oregon", variety == "Pinot Noir") %>%
  select(points, price, title) %>%
  arrange(desc(points), price)

## # A tibble: 2,737 x 3
##    points price title                                                           
##     <dbl> <dbl> <chr>                                                           
##  1     97    65 Ken Wright 2012 Abbott Claim Vineyard Pinot Noir                
##  2     96    30 Sineann 2015 TFL Pinot Noir (Willamette Valley)                 
##  3     96    40 Scott Paul 2009 Dix Pinot Noir (Dundee Hills)                   
##  4     96    60 Patricia Green Cellars 2015 Estate Vineyard Etzel Block Pinot N…
##  5     96    63 Ken Wright 2014 Bryce Vineyard Pinot Noir (Ribbon Ridge)        
##  6     96    63 Ken Wright 2014 Abbott Claim Vineyard Pinot Noir                
##  7     96    65 Ken Wright 2012 Freedom Hill Vineyard Pinot Noir (Willamette Va…
##  8     96    85 Alloro 2014 Estate Justina Pinot Noir (Chehalem Mountains)      
##  9     96    85 The Eyrie Vineyards 2012 Original Vines Estate Pinot Noir (Dund…
## 10     96    85 Domaine Drouhin Oregon 2011 Édition Limitée Pinot Noir (Dundee …
## # … with 2,727 more rows

Using the “skim()” function

What are the mean price and mean points for Oregon Pinot Noir?

# Store the Oregon Pinot Noir subset (sorted by points descending, then price
# ascending) so it can be passed to skim().
Oregon_pinot <- winemag_data %>%
  filter(province == "Oregon", variety == "Pinot Noir") %>%
  select(points, price, title) %>%
  arrange(desc(points), price)

# Summarise every column of the subset.
skim(Oregon_pinot)

## Skim summary statistics
##  n obs: 2737 
##  n variables: 3 
##  group variables:  
## 
## ── Variable type:character ─────────────────────────────────────────────────────
##  variable missing complete    n min max empty n_unique
##     title       0     2737 2737  20  99     0     2512
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##  variable missing complete    n  mean    sd p0 p25 p50 p75 p100     hist
##    points       0     2737 2737 89.49  2.66 80  88  90  91   97 ▁▁▃▇▇▇▃▁
##     price       0     2737 2737 44.86 20.21  9  30  42  55  275 ▇▇▁▁▁▁▁▁

# Same summary without an intermediate object; title is dropped so only
# points and price are summarised.
winemag_data %>%
  filter(province == "Oregon", variety == "Pinot Noir") %>%
  select(points, price) %>%
  arrange(desc(points), price) %>%
  skim()

## Skim summary statistics
##  n obs: 2737 
##  n variables: 2 
##  group variables:  
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##  variable missing complete    n  mean    sd p0 p25 p50 p75 p100     hist
##    points       0     2737 2737 89.49  2.66 80  88  90  91   97 ▁▁▃▇▇▇▃▁
##     price       0     2737 2737 44.86 20.21  9  30  42  55  275 ▇▇▁▁▁▁▁▁

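Next Level: - and = ‘&’ - or = ‘|’ - not = ‘!’ - not equal = ‘!=’ - top_n() = number of top items you want, and the column you want the top items for - top_frac() = fraction of top items you want (e.g. .5 for the top 50%), and the column you want the top items for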

Looking at the top 10 wines from France or Italy by price, showing points, price & title

Arrange by points descending

# Ten most expensive French or Italian wines, listed by points descending.
winemag_data %>%
  filter(country %in% c("France", "Italy")) %>%
  top_n(n = 10, wt = price) %>%
  select(points, price, title) %>%
  arrange(desc(points))

## # A tibble: 10 x 3
##    points price title                                         
##     <dbl> <dbl> <chr>                                         
##  1    100  1500 Château Lafite Rothschild 2010  Pauillac      
##  2    100  1500 Château Cheval Blanc 2010  Saint-Émilion      
##  3     98  1900 Château Margaux 2009  Margaux                 
##  4     97  2000 Château Pétrus 2011  Pomerol                  
##  5     96  1200 Château Haut-Brion 2009  Pessac-Léognan       
##  6     96  1300 Château Mouton Rothschild 2009  Pauillac      
##  7     96  2500 Château Pétrus 2014  Pomerol                  
##  8     96  2500 Domaine du Comte Liger-Belair 2010  La Romanée
##  9     96  2000 Domaine du Comte Liger-Belair 2005  La Romanée
## 10     88  3300 Château les Ormes Sorbet 2013  Médoc

Let's kick it up a little:

top 5 Oregon wines by points
that aren't Chardonnay
showing only points, price and title
arranged by price ascending

# Highest-scoring Oregon wines other than Chardonnay, cheapest first.
# top_n() keeps ties, so more than five rows can come back.
winemag_data %>%
  filter(province == "Oregon", variety != "Chardonnay") %>%
  top_n(n = 5, wt = points) %>%
  select(points, price, title) %>%
  arrange(price)

## # A tibble: 7 x 3
##   points price title                                                            
##    <dbl> <dbl> <chr>                                                            
## 1     97    65 Ken Wright 2012 Abbott Claim Vineyard Pinot Noir                 
## 2     99    75 Cayuse 2009 En Chamberlin Vineyard Syrah (Walla Walla Valley (OR…
## 3     99    75 Cayuse 2011 En Chamberlin Vineyard Syrah (Walla Walla Valley (OR…
## 4     98    75 Cayuse 2011 En Cerise Vineyard Syrah (Walla Walla Valley (OR))   
## 5     97    80 Cayuse 2009 The Widowmaker Cabernet Sauvignon (Walla Walla Valle…
## 6     97    85 Cayuse 2009 Armada Vineyard Syrah (Walla Walla Valley (OR))      
## 7     97    90 Cayuse 2011 Widowmaker En Chamberlin Vineyard Cabernet Sauvignon…

Even though we asked for the top 5, the table has 7 rows: four wines are tied at 97 points, and top_n() keeps all of the ties.
top_frac() gives the top fraction of a group
top_frac(.5, points) means the top 50% by points

Use the top_frac() function to find... - top 5% by points - Oregon wines - that are neither Pinot Noir nor Chardonnay - showing only points, price and title - arranged by points descending and price ascending

library(dplyr)

# Top 5% (by points) of Oregon wines that are neither Chardonnay nor
# Pinot Noir, sorted by points descending, then price ascending.
winemag_data %>%
  filter(
    province == "Oregon",
    !(variety == "Chardonnay" | variety == "Pinot Noir")
  ) %>%
  top_frac(n = 0.05, wt = points) %>%
  select(points, price, title) %>%
  arrange(desc(points), price)

## # A tibble: 123 x 3
##    points price title                                                           
##     <dbl> <dbl> <chr>                                                           
##  1     99    75 Cayuse 2009 En Chamberlin Vineyard Syrah (Walla Walla Valley (O…
##  2     99    75 Cayuse 2011 En Chamberlin Vineyard Syrah (Walla Walla Valley (O…
##  3     98    75 Cayuse 2011 En Cerise Vineyard Syrah (Walla Walla Valley (OR))  
##  4     97    80 Cayuse 2009 The Widowmaker Cabernet Sauvignon (Walla Walla Vall…
##  5     97    85 Cayuse 2009 Armada Vineyard Syrah (Walla Walla Valley (OR))     
##  6     97    90 Cayuse 2011 Widowmaker En Chamberlin Vineyard Cabernet Sauvigno…
##  7     96    38 Trisaetum 2015 Estates Reserve Riesling (Willamette Valley)     
##  8     96    70 Cayuse 2012 Cailloux Vineyard Viognier (Walla Walla Valley (OR))
##  9     96    75 Cayuse 2009 Camaspelo Cabernet Sauvignon-Merlot (Walla Walla Va…
## 10     96    90 Cayuse 2011 Cailloux Vineyard Syrah (Walla Walla Valley (OR))   
## # … with 113 more rows

TAME & TIDY DATA

tame data is data w/ understandable column names and well-formatted values
tidy data is data that follows three rules:
- Each variable must have its own column
- Each observation must have its own row
- Each value must have its own cell

# All Oregon rows, with every column kept.
wine_oregon <- filter(winemag_data, province == "Oregon")

wine_oregon

## # A tibble: 5,147 x 15
##       id country description designation points price province region_1 region_2
##    <dbl> <chr>   <chr>       <chr>        <dbl> <dbl> <chr>    <chr>    <chr>   
##  1     2 US      Tart and s… <NA>            87    14 Oregon   Willame… Willame…
##  2     4 US      Much like … Vintner's …     87    65 Oregon   Willame… Willame…
##  3    21 US      A sleek mi… <NA>            87    20 Oregon   Oregon   Oregon …
##  4    35 US      As with ma… Hyland          86    50 Oregon   McMinnv… Willame…
##  5    41 US      A stiff, t… <NA>            86    22 Oregon   Willame… Willame…
##  6    78 US      Some rosés… Rosé of         86    25 Oregon   Eola-Am… Willame…
##  7   173 US      This wine … <NA>            91    38 Oregon   Willame… Willame…
##  8   233 US      There is a… Reserve         85    28 Oregon   Willame… Willame…
##  9   248 US      This seems… Estate Sin…     85    45 Oregon   Willame… Willame…
## 10   251 US      Spicy and … Papillon E…     85    22 Oregon   Willame… Willame…
## # … with 5,137 more rows, and 6 more variables: taster_name <chr>,
## #   taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>,
## #   year <dbl>

Data Summarization

data summarization involves
- describing data w/ numerical summaries
- visualizing data w/ graphical summaries
There is a difference in how we describe the data depending on whether it is
- discrete
- continuous

Describing discrete data (country has discrete data)

# Number of rows per country.
count(winemag_data, country)

## # A tibble: 43 x 2
##    country                    n
##  * <chr>                  <int>
##  1 Argentina               3578
##  2 Armenia                    2
##  3 Australia               1905
##  4 Austria                 2523
##  5 Bosnia and Herzegovina     2
##  6 Brazil                    34
##  7 Bulgaria                 136
##  8 Canada                   243
##  9 Chile                   4162
## 10 China                      1
## # … with 33 more rows

# Number of rows per country/variety combination.
count(winemag_data, country, variety)

## # A tibble: 1,470 x 3
##    country   variety                               n
##    <chr>     <chr>                             <int>
##  1 Argentina Barbera                               1
##  2 Argentina Bonarda                             103
##  3 Argentina Bordeaux-style Red Blend             86
##  4 Argentina Bordeaux-style White Blend            1
##  5 Argentina Cabernet Blend                        8
##  6 Argentina Cabernet Franc                       61
##  7 Argentina Cabernet Franc-Cabernet Sauvignon     3
##  8 Argentina Cabernet Franc-Malbec                 4
##  9 Argentina Cabernet Sauvignon                  525
## 10 Argentina Cabernet Sauvignon-Cabernet Franc     1
## # … with 1,460 more rows

Use filter() and count() to figure out which country has more Chardonnay: France or the US?

# Chardonnay row counts for the US and France.
winemag_data %>%
  filter(country %in% c("US", "France"), variety == "Chardonnay") %>%
  count(country)

## # A tibble: 2 x 2
##   country     n
## * <chr>   <int>
## 1 France   1969
## 2 US       4170

Visualization Basics

ggplot2 requires the following:
- Data - The data to visualize
- Aesthetics - Mapping graphical elements to data
- Geometries - Or “geom,” the graphic representing the data

# Shorter alias for the data set.
wine <- winemag_data

# Bar chart: number of Chardonnay rows for France and the US.
wine %>%
  filter(country %in% c("France", "US"), variety == "Chardonnay") %>%
  ggplot(mapping = aes(x = country)) +
  geom_bar()

Now let's compare three wine-producing states (Washington, Oregon and California) and fill the bars by variety

# Stacked bar chart: one bar per state, split by the three selected varieties.
wine %>%
  filter(
    province %in% c("Washington", "Oregon", "California"),
    variety %in% c("Cabernet Sauvignon", "Syrah", "Pinot Noir")
  ) %>%
  ggplot(mapping = aes(x = province, fill = variety)) +
  geom_bar()

Create a stacked bar graph that shows - a count of wines - w/ greater than 97 points - from California, Oregon, Washington - stacked by variety

# Stacked bar chart of wines scoring above 97 points in the three states,
# with each bar split by variety.
wine %>%
  filter(
    province %in% c("California", "Oregon", "Washington"),
    points > 97
  ) %>%
  ggplot(mapping = aes(x = province, fill = variety)) +
  geom_bar()

fill sets the variable that the bars are stacked by

Describing continuous data

you can use the summarize function for calculating things like mean, median, variance, min/max, etc.

# Mean of points over the whole data set, returned as a one-row tibble.
summarize(wine, avg_points = mean(points))

## # A tibble: 1 x 1
##   avg_points
##        <dbl>
## 1       88.6

Let's graph it

# Histogram of points, using geom_histogram()'s default binning.
ggplot(data = wine, mapping = aes(x = points)) +
  geom_histogram()

We can look at the relationship between two variables

# Scatter plot of points (x) against log(price) (y).
ggplot(data = wine, mapping = aes(x = points, y = log(price))) +
  geom_point()

We can combine the two: - filter to US wines - !is.na(price) removes the rows that don’t have a price - group by province - summarize each province: count is the number of rows, average_points is mean(points) and average_price is mean(price) - keep only the provinces with a count of more than 100 - then arrange by average points descending

# Per-province summary of US wines that have a price: row count, mean points
# and mean price, kept only for provinces with more than 100 rows and sorted
# by mean points descending.
wine %>%
  filter(country == "US", !is.na(price)) %>%
  group_by(province) %>%
  summarise(
    count = n(),
    average_points = mean(points),
    average_price = mean(price)
  ) %>%
  filter(count > 100) %>%
  arrange(desc(average_points))

## # A tibble: 7 x 4
##   province   count average_points average_price
##   <chr>      <int>          <dbl>         <dbl>
## 1 California 19073           89.4          40.7
## 2 Oregon      5147           89.1          37.1
## 3 Washington  8353           89.0          32.8
## 4 New York    2364           87.3          23.3
## 5 Idaho        179           86.6          21.0
## 6 Michigan     101           86.3          33.3
## 7 Virginia     345           86.0          28.6

Create a tibble that shows - US wines - grouped by province and variety - summarized on count and max price - w/ a count greater than 100 - sorted by count descending

# US wines that have a price, summarised per province/variety pair:
# row count and maximum price, kept only for pairs with more than 100 rows
# and sorted by count descending.
#
# Both keys go in ONE group_by() call: a second group_by() replaces the
# earlier grouping instead of adding to it, which would summarise by variety
# alone and silently drop province from the result.
# NOTE(review): re-run this chunk to refresh the printed table that follows
# it; that output was produced with the variety-only grouping.
wine %>%
  filter(country == "US", !is.na(price)) %>%
  group_by(province, variety) %>%
  summarize(
    count = n(),
    max_price = max(price)
  ) %>%
  ungroup() %>%  # summarize() only peels off the last grouping level
  filter(count > 100) %>%
  arrange(desc(count))

## # A tibble: 32 x 3
##    variety                  count max_price
##    <chr>                    <int>     <dbl>
##  1 Pinot Noir                6871       275
##  2 Cabernet Sauvignon        4208       625
##  3 Chardonnay                4170      2013
##  4 Syrah                     2324       750
##  5 Red Blend                 2116       185
##  6 Zinfandel                 1559       100
##  7 Riesling                  1495       200
##  8 Merlot                    1430       200
##  9 Bordeaux-style Red Blend  1293       350
## 10 Sauvignon Blanc           1147        75
## # … with 22 more rows

wine %>%
  filter(province == "California" | province =="Oregon" | province== "Washington") %>%
  ggplot(aes(x=log(price), fill=province)) + geom_density(alpha = 0.4)

Tidy Data & Summarise Function

Matt

9/21/2019