library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ ggplot2 3.3.3 ✓ forcats 0.3.0
## Warning: package 'stringr' was built under R version 3.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(skimr)
# Earlier CSV-based load, kept for reference only:
#winemag_data <- read_csv("Downloads/winemag-data.csv")
# Load the wine data set from a serialized RDS file.
# NOTE(review): this is an absolute path inside one user's home directory, so
# it will not resolve on other machines — consider a project-relative path.
winemag_data <- read_rds("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/wine.rds")
# Oregon Pinot Noirs, highest score first (cheapest first within a score),
# showing only points, price and title.
winemag_data %>%
  filter(province == "Oregon", variety == "Pinot Noir") %>%
  arrange(desc(points), price) %>%
  select(points, price, title)
## # A tibble: 2,737 x 3
## points price title
## <dbl> <dbl> <chr>
## 1 97 65 Ken Wright 2012 Abbott Claim Vineyard Pinot Noir
## 2 96 30 Sineann 2015 TFL Pinot Noir (Willamette Valley)
## 3 96 40 Scott Paul 2009 Dix Pinot Noir (Dundee Hills)
## 4 96 60 Patricia Green Cellars 2015 Estate Vineyard Etzel Block Pinot N…
## 5 96 63 Ken Wright 2014 Bryce Vineyard Pinot Noir (Ribbon Ridge)
## 6 96 63 Ken Wright 2014 Abbott Claim Vineyard Pinot Noir
## 7 96 65 Ken Wright 2012 Freedom Hill Vineyard Pinot Noir (Willamette Va…
## 8 96 85 Alloro 2014 Estate Justina Pinot Noir (Chehalem Mountains)
## 9 96 85 The Eyrie Vineyards 2012 Original Vines Estate Pinot Noir (Dund…
## 10 96 85 Domaine Drouhin Oregon 2011 Édition Limitée Pinot Noir (Dundee …
## # … with 2,727 more rows
# Store the ranked Oregon Pinot Noir table, then print summary statistics
# for each of its columns.
Oregon_pinot <- winemag_data %>%
  filter(province == "Oregon", variety == "Pinot Noir") %>%
  arrange(desc(points), price) %>%
  select(points, price, title)
skim(Oregon_pinot)
## Skim summary statistics
## n obs: 2737
## n variables: 3
## group variables:
##
## ── Variable type:character ─────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## title 0 2737 2737 20 99 0 2512
##
## ── Variable type:numeric ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## points 0 2737 2737 89.49 2.66 80 88 90 91 97 ▁▁▃▇▇▇▃▁
## price 0 2737 2737 44.86 20.21 9 30 42 55 275 ▇▇▁▁▁▁▁▁
# Summary statistics for the points and price of Oregon Pinot Noirs, computed
# in one pipeline without storing an intermediate table.
winemag_data %>%
  filter(province == "Oregon" & variety == "Pinot Noir") %>%
  arrange(desc(points), price) %>%
  select(points, price) %>%
  skim()
## Skim summary statistics
## n obs: 2737
## n variables: 2
## group variables:
##
## ── Variable type:numeric ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## points 0 2737 2737 89.49 2.66 80 88 90 91 97 ▁▁▃▇▇▇▃▁
## price 0 2737 2737 44.86 20.21 9 30 42 55 275 ▇▇▁▁▁▁▁▁
Next Level: - and = ‘&’ - or = ‘|’ - not = ‘!’ - not equal = ‘!=’ - top_n() = the number of top items you want, and the column you want the top items for - top_frac() = the fraction of top items you want, and the column you want the top items for
# The ten most expensive wines from France or Italy, listed from highest to
# lowest score.
winemag_data %>%
  filter(country %in% c("France", "Italy")) %>%
  top_n(10, price) %>%
  select(points, price, title) %>%
  arrange(desc(points))
## # A tibble: 10 x 3
## points price title
## <dbl> <dbl> <chr>
## 1 100 1500 Château Lafite Rothschild 2010 Pauillac
## 2 100 1500 Château Cheval Blanc 2010 Saint-Émilion
## 3 98 1900 Château Margaux 2009 Margaux
## 4 97 2000 Château Pétrus 2011 Pomerol
## 5 96 1200 Château Haut-Brion 2009 Pessac-Léognan
## 6 96 1300 Château Mouton Rothschild 2009 Pauillac
## 7 96 2500 Château Pétrus 2014 Pomerol
## 8 96 2500 Domaine du Comte Liger-Belair 2010 La Romanée
## 9 96 2000 Domaine du Comte Liger-Belair 2005 La Romanée
## 10 88 3300 Château les Ormes Sorbet 2013 Médoc
# Top five Oregon wines by points, excluding Chardonnay, listed from cheapest
# to most expensive. top_n() keeps ties, so more than five rows can come back.
winemag_data %>%
  filter(province == "Oregon", variety != "Chardonnay") %>%
  top_n(5, points) %>%
  select(points, price, title) %>%
  arrange(price)
## # A tibble: 7 x 3
## points price title
## <dbl> <dbl> <chr>
## 1 97 65 Ken Wright 2012 Abbott Claim Vineyard Pinot Noir
## 2 99 75 Cayuse 2009 En Chamberlin Vineyard Syrah (Walla Walla Valley (OR…
## 3 99 75 Cayuse 2011 En Chamberlin Vineyard Syrah (Walla Walla Valley (OR…
## 4 98 75 Cayuse 2011 En Cerise Vineyard Syrah (Walla Walla Valley (OR))
## 5 97 80 Cayuse 2009 The Widowmaker Cabernet Sauvignon (Walla Walla Valle…
## 6 97 85 Cayuse 2009 Armada Vineyard Syrah (Walla Walla Valley (OR))
## 7 97 90 Cayuse 2011 Widowmaker En Chamberlin Vineyard Cabernet Sauvignon…
Even though we asked for the top 5, the table has 7 rows: top_n() keeps ties, and four wines are tied at 97 points.
top_frac(.5, points) keeps the top 50% of rows, ranked by points.
Use the top_frac() function to find: - the top 5% by points - of Oregon wines - that are neither Pinot Noir nor Chardonnay - showing only points, price and title - arranged by points descending and price ascending
library(dplyr)
# Top 5% (by points) of Oregon wines that are neither Chardonnay nor
# Pinot Noir, highest score first and cheapest first within a score.
winemag_data %>%
  filter(
    province == "Oregon",
    variety != "Chardonnay",
    variety != "Pinot Noir"
  ) %>%
  top_frac(.05, points) %>%
  select(points, price, title) %>%
  arrange(desc(points), price)
## # A tibble: 123 x 3
## points price title
## <dbl> <dbl> <chr>
## 1 99 75 Cayuse 2009 En Chamberlin Vineyard Syrah (Walla Walla Valley (O…
## 2 99 75 Cayuse 2011 En Chamberlin Vineyard Syrah (Walla Walla Valley (O…
## 3 98 75 Cayuse 2011 En Cerise Vineyard Syrah (Walla Walla Valley (OR))
## 4 97 80 Cayuse 2009 The Widowmaker Cabernet Sauvignon (Walla Walla Vall…
## 5 97 85 Cayuse 2009 Armada Vineyard Syrah (Walla Walla Valley (OR))
## 6 97 90 Cayuse 2011 Widowmaker En Chamberlin Vineyard Cabernet Sauvigno…
## 7 96 38 Trisaetum 2015 Estates Reserve Riesling (Willamette Valley)
## 8 96 70 Cayuse 2012 Cailloux Vineyard Viognier (Walla Walla Valley (OR))
## 9 96 75 Cayuse 2009 Camaspelo Cabernet Sauvignon-Merlot (Walla Walla Va…
## 10 96 90 Cayuse 2011 Cailloux Vineyard Syrah (Walla Walla Valley (OR))
## # … with 113 more rows
# Keep every column, but only the rows for wines from Oregon, then print it.
wine_oregon <- filter(winemag_data, province == "Oregon")
wine_oregon
## # A tibble: 5,147 x 15
## id country description designation points price province region_1 region_2
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 2 US Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 2 4 US Much like … Vintner's … 87 65 Oregon Willame… Willame…
## 3 21 US A sleek mi… <NA> 87 20 Oregon Oregon Oregon …
## 4 35 US As with ma… Hyland 86 50 Oregon McMinnv… Willame…
## 5 41 US A stiff, t… <NA> 86 22 Oregon Willame… Willame…
## 6 78 US Some rosés… Rosé of 86 25 Oregon Eola-Am… Willame…
## 7 173 US This wine … <NA> 91 38 Oregon Willame… Willame…
## 8 233 US There is a… Reserve 85 28 Oregon Willame… Willame…
## 9 248 US This seems… Estate Sin… 85 45 Oregon Willame… Willame…
## 10 251 US Spicy and … Papillon E… 85 22 Oregon Willame… Willame…
## # … with 5,137 more rows, and 6 more variables: taster_name <chr>,
## # taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>,
## # year <dbl>
# Number of wines per country.
count(winemag_data, country)
## # A tibble: 43 x 2
## country n
## * <chr> <int>
## 1 Argentina 3578
## 2 Armenia 2
## 3 Australia 1905
## 4 Austria 2523
## 5 Bosnia and Herzegovina 2
## 6 Brazil 34
## 7 Bulgaria 136
## 8 Canada 243
## 9 Chile 4162
## 10 China 1
## # … with 33 more rows
# Number of wines per country/variety combination.
count(winemag_data, country, variety)
## # A tibble: 1,470 x 3
## country variety n
## <chr> <chr> <int>
## 1 Argentina Barbera 1
## 2 Argentina Bonarda 103
## 3 Argentina Bordeaux-style Red Blend 86
## 4 Argentina Bordeaux-style White Blend 1
## 5 Argentina Cabernet Blend 8
## 6 Argentina Cabernet Franc 61
## 7 Argentina Cabernet Franc-Cabernet Sauvignon 3
## 8 Argentina Cabernet Franc-Malbec 4
## 9 Argentina Cabernet Sauvignon 525
## 10 Argentina Cabernet Sauvignon-Cabernet Franc 1
## # … with 1,460 more rows
Use filter() and count() to figure out which country has more Chardonnay: France or the US?
# Count the Chardonnays from the US and from France.
winemag_data %>%
  filter(country %in% c("US", "France"), variety == "Chardonnay") %>%
  count(country)
## # A tibble: 2 x 2
## country n
## * <chr> <int>
## 1 France 1969
## 2 US 4170
# Shorter alias for the data set, used by the chunks from here on.
wine <- winemag_data
# Bar chart of the number of Chardonnays from France and from the US.
wine %>%
  filter(country %in% c("France", "US"), variety == "Chardonnay") %>%
  ggplot(aes(x = country)) +
  geom_bar()
Now let's compare three wine-producing states (Washington, Oregon and California) and fill the bars by variety
# Stacked bar chart: one bar per state, split by three selected varieties.
wine %>%
  filter(
    province %in% c("Washington", "Oregon", "California"),
    variety %in% c("Cabernet Sauvignon", "Syrah", "Pinot Noir")
  ) %>%
  ggplot(aes(x = province, fill = variety)) +
  geom_bar()
Create a stacked bar graph that shows - a count of wines - with greater than 97 points - from California, Oregon, Washington - stacked by variety
# Stacked bar chart of wines scoring above 97 points in the three states,
# with each bar split by variety.
wine %>%
  filter(
    province %in% c("California", "Oregon", "Washington"),
    points > 97
  ) %>%
  ggplot(aes(x = province, fill = variety)) +
  geom_bar()
The fill aesthetic sets the variable I want to stack the bars by
# Mean score across every wine in the data set.
summarize(wine, avg_points = mean(points))
## # A tibble: 1 x 1
## avg_points
## <dbl>
## 1 88.6
Let's graph it
# Histogram of the points awarded to all wines.
ggplot(wine, aes(x = points)) +
  geom_histogram()
We can look at the relationship between two variables
# Scatter plot of points against the natural log of price.
ggplot(wine, aes(x = points, y = log(price))) +
  geom_point()
We can combine the two: - keep only US wines - !is.na(price) removes all of the rows that don’t have a price included - group by province - summarize by different variables: count is the number of wines, average_points is mean(points), and likewise average_price is mean(price) - keep only provinces with a count of more than 100 - then arrange by average points descending
# Per-province summary of priced US wines: number of wines, mean points and
# mean price, for provinces with more than 100 wines, best average score first.
wine %>%
  filter(country == "US", !is.na(price)) %>%
  group_by(province) %>%
  summarize(
    count = n(),
    average_points = mean(points),
    average_price = mean(price)
  ) %>%
  filter(count > 100) %>%
  arrange(desc(average_points))
## # A tibble: 7 x 4
## province count average_points average_price
## <chr> <int> <dbl> <dbl>
## 1 California 19073 89.4 40.7
## 2 Oregon 5147 89.1 37.1
## 3 Washington 8353 89.0 32.8
## 4 New York 2364 87.3 23.3
## 5 Idaho 179 86.6 21.0
## 6 Michigan 101 86.3 33.3
## 7 Virginia 345 86.0 28.6
Create a tibble that shows - US wines - grouped by province and variety - summarized on count and max price - with a count greater than 100 - sorted by count descending
# Priced US wines summarised per province/variety combination: number of wines
# and highest price, keeping combinations with more than 100 wines, largest
# count first.
# NOTE: the printed result that follows in this document came from the earlier
# variety-only grouping and needs to be regenerated.
wine %>%
  filter(country == "US") %>%
  filter(!is.na(price)) %>%
  # Both keys must go in one group_by() call: a second group_by() replaces the
  # existing grouping instead of adding to it.
  group_by(province, variety) %>%
  summarize(
    count = n(),
    max_price = max(price),
    # Drop the leftover province grouping so the result is an ungrouped tibble.
    .groups = "drop"
  ) %>%
  filter(count > 100) %>%
  arrange(desc(count))
## # A tibble: 32 x 3
## variety count max_price
## <chr> <int> <dbl>
## 1 Pinot Noir 6871 275
## 2 Cabernet Sauvignon 4208 625
## 3 Chardonnay 4170 2013
## 4 Syrah 2324 750
## 5 Red Blend 2116 185
## 6 Zinfandel 1559 100
## 7 Riesling 1495 200
## 8 Merlot 1430 200
## 9 Bordeaux-style Red Blend 1293 350
## 10 Sauvignon Blanc 1147 75
## # … with 22 more rows
# Overlaid, semi-transparent density curves of log(price), one per state.
wine %>%
  filter(province %in% c("California", "Oregon", "Washington")) %>%
  ggplot(aes(x = log(price), fill = province)) +
  geom_density(alpha = 0.4)