library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------ tidyverse 1.3.0 --
## <U+221A> ggplot2 3.3.3 <U+221A> purrr 0.3.4
## <U+221A> tibble 3.0.0 <U+221A> stringr 1.4.0
## <U+221A> tidyr 1.0.2 <U+221A> forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(rpivotTable)
wines <- read_csv("winemag-data-130k-v2.csv")
wines
## # A tibble: 129,971 x 14
## X1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 0 Italy Aromas inc~ Vulka Bian~ 87 NA Sicily ~ Etna
## 2 1 Portug~ This is ri~ Avidagos 87 15 Douro <NA>
## 3 2 US Tart and s~ <NA> 87 14 Oregon Willame~
## 4 3 US Pineapple ~ Reserve La~ 87 13 Michigan Lake Mi~
## 5 4 US Much like ~ Vintner's ~ 87 65 Oregon Willame~
## 6 5 Spain Blackberry~ Ars In Vit~ 87 15 Norther~ Navarra
## 7 6 Italy Here's a b~ Belsito 87 16 Sicily ~ Vittoria
## 8 7 France This dry a~ <NA> 87 24 Alsace Alsace
## 9 8 Germany Savory dri~ Shine 87 12 Rheinhe~ <NA>
## 10 9 France This has g~ Les Natures 87 27 Alsace Alsace
## # ... with 129,961 more rows, and 6 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <chr>, winery <chr>
glimpse(wines)
## Rows: 129,971
## Columns: 14
## $ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12...
## $ country <chr> "Italy", "Portugal", "US", "US", "US", "...
## $ description <chr> "Aromas include tropical fruit, broom, b...
## $ designation <chr> "Vulka Bianco", "Avidagos", NA, "Reserve...
## $ points <dbl> 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, ...
## $ price <dbl> NA, 15, 14, 13, 65, 15, 16, 24, 12, 27, ...
## $ province <chr> "Sicily & Sardinia", "Douro", "Oregon", ...
## $ region_1 <chr> "Etna", NA, "Willamette Valley", "Lake M...
## $ region_2 <chr> NA, NA, "Willamette Valley", NA, "Willam...
## $ taster_name <chr> "Kerin O’Keefe", "Roger Voss", "Paul Gre...
## $ taster_twitter_handle <chr> "@kerinokeefe", "@vossroger", "@paulgwin...
## $ title <chr> "Nicosia 2013 Vulka Bianco (Etna)", "Qu...
## $ variety <chr> "White Blend", "Portuguese Red", "Pinot ...
## $ winery <chr> "Nicosia", "Quinta dos Avidagos", "Rains...
table(wines$country)
##
## Argentina Armenia Australia
## 3800 2 2329
## Austria Bosnia and Herzegovina Brazil
## 3345 2 52
## Bulgaria Canada Chile
## 141 257 4472
## China Croatia Cyprus
## 1 73 11
## Czech Republic Egypt England
## 12 1 74
## France Georgia Germany
## 22093 86 2165
## Greece Hungary India
## 466 146 9
## Israel Italy Lebanon
## 505 19540 35
## Luxembourg Macedonia Mexico
## 6 12 70
## Moldova Morocco New Zealand
## 59 28 1419
## Peru Portugal Romania
## 16 5691 120
## Serbia Slovakia Slovenia
## 12 1 87
## South Africa Spain Switzerland
## 1401 6645 7
## Turkey Ukraine Uruguay
## 90 14 109
## US
## 54504
Skrót klawiszowy: ctrl+shift+m -> %>%
wines%>%
filter( points >= 94, price < 25)
## # A tibble: 66 x 14
## X1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 5011 US Truly stun~ Lewis Esta~ 95 20 Washing~ Columbi~
## 2 6267 US This taste~ Lucille La~ 94 18 Washing~ Yakima ~
## 3 10763 Portug~ His skills~ Rapariga d~ 94 23 Alentej~ <NA>
## 4 12944 France The Côte d~ Côte du Py~ 94 24 Beaujol~ Morgon
## 5 12945 France Be gratefu~ Vieilles V~ 94 24 Beaujol~ Moulin-~
## 6 12967 France A firm and~ <NA> 94 24 Beaujol~ Moulin-~
## 7 15196 France The home v~ Château Bo~ 95 20 Southwe~ Madiran
## 8 15211 US The deep g~ <NA> 94 22 Oregon Willame~
## 9 17294 US Opulento i~ Opulento D~ 94 20 Washing~ Yakima ~
## 10 17983 France This is on~ <NA> 94 20 Provence Coteaux~
## # ... with 56 more rows, and 6 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <chr>, winery <chr>
Losowanie próbki 15% obserwacji ze zbioru.
wines%>%
sample_frac( 0.15)
## # A tibble: 19,496 x 14
## X1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 68617 France An impress~ Le Bouissel 90 20 Southwe~ Fronton
## 2 13109 Italy Here's a R~ Sotto Cast~ 89 50 Piedmont Barolo
## 3 79392 US Earthy cra~ <NA> 90 27 Califor~ Carneros
## 4 9481 US Shows the ~ <NA> 87 23 Califor~ Carneros
## 5 108921 US As soon as~ Roseum Hue~ 84 13 Califor~ Paso Ro~
## 6 124713 Portug~ Aged for s~ MR Premium 90 NA Alentej~ <NA>
## 7 25399 France The highes~ Clos Trigu~ 94 30 Southwe~ Cahors
## 8 60906 US This is a ~ Fox Family~ 89 45 Califor~ Santa B~
## 9 11859 Italy Passoro op~ Passoro 50~ 84 23 Sicily ~ Sicilia
## 10 46010 Portug~ One of a s~ Single Har~ 95 300 Port <NA>
## # ... with 19,486 more rows, and 6 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <chr>, winery <chr>
wines%>%
top_n( 3, points)
## # A tibble: 19 x 14
## X1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 345 Austra~ This wine ~ Rare 100 350 Victoria Rutherg~
## 2 7335 Italy Thick as m~ Occhio di ~ 100 210 Tuscany Vin San~
## 3 36528 France This is a ~ Brut 100 259 Champag~ Champag~
## 4 39286 Italy A perfect ~ Masseto 100 460 Tuscany Toscana
## 5 42197 Portug~ This is th~ Barca-Velha 100 450 Douro <NA>
## 6 45781 Italy This gorge~ Riserva 100 550 Tuscany Brunell~
## 7 45798 US Tasted in ~ <NA> 100 200 Califor~ Napa Va~
## 8 58352 France This is a ~ <NA> 100 150 Bordeaux Saint-J~
## 9 89728 France This lates~ Cristal Vi~ 100 250 Champag~ Champag~
## 10 89729 France This new r~ Le Mesnil ~ 100 617 Champag~ Champag~
## 11 111753 France Almost bla~ <NA> 100 1500 Bordeaux Pauillac
## 12 111754 Italy It takes o~ Cerretalto 100 270 Tuscany Brunell~
## 13 111755 France This is th~ <NA> 100 1500 Bordeaux Saint-É~
## 14 111756 France A hugely p~ <NA> 100 359 Bordeaux Saint-J~
## 15 113929 US In 2005 Ch~ Royal City 100 80 Washing~ Columbi~
## 16 114972 Portug~ A powerful~ Nacional V~ 100 650 Port <NA>
## 17 118058 US This wine ~ La Muse 100 450 Califor~ Sonoma ~
## 18 122935 France Full of ri~ <NA> 100 848 Bordeaux Pessac-~
## 19 123545 US Initially ~ Bionic Frog 100 80 Washing~ Walla W~
## # ... with 6 more variables: region_2 <chr>, taster_name <chr>,
## # taster_twitter_handle <chr>, title <chr>, variety <chr>, winery <chr>
wines%>%
top_n( 100, -price)
## # A tibble: 177 x 14
## X1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 1620 Portug~ The very l~ Brado Bran~ 85 6 Alentej~ <NA>
## 2 1987 Spain Berry and ~ Flirty Bird 85 4 Central~ Vino de~
## 3 2335 US Reserved a~ <NA> 85 6 Washing~ Washing~
## 4 2618 Argent~ Lightly br~ <NA> 83 6 Mendoza~ Mendoza
## 5 2780 Portug~ This feels~ Morgado da~ 84 5 Alentej~ <NA>
## 6 3167 Italy Packaged i~ Mini 86 5 Veneto Prosecco
## 7 3948 Portug~ Soft, swee~ Coreto 83 6 Lisboa <NA>
## 8 3950 Portug~ On the dry~ Escolha 83 5 Vinho V~ <NA>
## 9 5152 Spain A steal fo~ Vina Borgia 87 6 Norther~ Campo d~
## 10 5789 France This is a ~ <NA> 83 5 France ~ Vin de ~
## # ... with 167 more rows, and 6 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <chr>, winery <chr>
wines%>%
arrange( desc(points))
## # A tibble: 129,971 x 14
## X1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 345 Austra~ This wine ~ Rare 100 350 Victoria Rutherg~
## 2 7335 Italy Thick as m~ Occhio di ~ 100 210 Tuscany Vin San~
## 3 36528 France This is a ~ Brut 100 259 Champag~ Champag~
## 4 39286 Italy A perfect ~ Masseto 100 460 Tuscany Toscana
## 5 42197 Portug~ This is th~ Barca-Velha 100 450 Douro <NA>
## 6 45781 Italy This gorge~ Riserva 100 550 Tuscany Brunell~
## 7 45798 US Tasted in ~ <NA> 100 200 Califor~ Napa Va~
## 8 58352 France This is a ~ <NA> 100 150 Bordeaux Saint-J~
## 9 89728 France This lates~ Cristal Vi~ 100 250 Champag~ Champag~
## 10 89729 France This new r~ Le Mesnil ~ 100 617 Champag~ Champag~
## # ... with 129,961 more rows, and 6 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <chr>, winery <chr>
wines%>%
arrange( -points)
## # A tibble: 129,971 x 14
## X1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 345 Austra~ This wine ~ Rare 100 350 Victoria Rutherg~
## 2 7335 Italy Thick as m~ Occhio di ~ 100 210 Tuscany Vin San~
## 3 36528 France This is a ~ Brut 100 259 Champag~ Champag~
## 4 39286 Italy A perfect ~ Masseto 100 460 Tuscany Toscana
## 5 42197 Portug~ This is th~ Barca-Velha 100 450 Douro <NA>
## 6 45781 Italy This gorge~ Riserva 100 550 Tuscany Brunell~
## 7 45798 US Tasted in ~ <NA> 100 200 Califor~ Napa Va~
## 8 58352 France This is a ~ <NA> 100 150 Bordeaux Saint-J~
## 9 89728 France This lates~ Cristal Vi~ 100 250 Champag~ Champag~
## 10 89729 France This new r~ Le Mesnil ~ 100 617 Champag~ Champag~
## # ... with 129,961 more rows, and 6 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <chr>, winery <chr>
wines%>%
select( country, province:region_2)
## # A tibble: 129,971 x 4
## country province region_1 region_2
## <chr> <chr> <chr> <chr>
## 1 Italy Sicily & Sardinia Etna <NA>
## 2 Portugal Douro <NA> <NA>
## 3 US Oregon Willamette Valley Willamette Valley
## 4 US Michigan Lake Michigan Shore <NA>
## 5 US Oregon Willamette Valley Willamette Valley
## 6 Spain Northern Spain Navarra <NA>
## 7 Italy Sicily & Sardinia Vittoria <NA>
## 8 France Alsace Alsace <NA>
## 9 Germany Rheinhessen <NA> <NA>
## 10 France Alsace Alsace <NA>
## # ... with 129,961 more rows
wines%>%
rename( punkty = points)
## # A tibble: 129,971 x 14
## X1 country description designation punkty price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 0 Italy Aromas inc~ Vulka Bian~ 87 NA Sicily ~ Etna
## 2 1 Portug~ This is ri~ Avidagos 87 15 Douro <NA>
## 3 2 US Tart and s~ <NA> 87 14 Oregon Willame~
## 4 3 US Pineapple ~ Reserve La~ 87 13 Michigan Lake Mi~
## 5 4 US Much like ~ Vintner's ~ 87 65 Oregon Willame~
## 6 5 Spain Blackberry~ Ars In Vit~ 87 15 Norther~ Navarra
## 7 6 Italy Here's a b~ Belsito 87 16 Sicily ~ Vittoria
## 8 7 France This dry a~ <NA> 87 24 Alsace Alsace
## 9 8 Germany Savory dri~ Shine 87 12 Rheinhe~ <NA>
## 10 9 France This has g~ Les Natures 87 27 Alsace Alsace
## # ... with 129,961 more rows, and 6 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <chr>, winery <chr>
usd_to_pln = 3.95
wines<-wines%>%
mutate( price_pln = price * usd_to_pln)
wines
## # A tibble: 129,971 x 15
## X1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 0 Italy Aromas inc~ Vulka Bian~ 87 NA Sicily ~ Etna
## 2 1 Portug~ This is ri~ Avidagos 87 15 Douro <NA>
## 3 2 US Tart and s~ <NA> 87 14 Oregon Willame~
## 4 3 US Pineapple ~ Reserve La~ 87 13 Michigan Lake Mi~
## 5 4 US Much like ~ Vintner's ~ 87 65 Oregon Willame~
## 6 5 Spain Blackberry~ Ars In Vit~ 87 15 Norther~ Navarra
## 7 6 Italy Here's a b~ Belsito 87 16 Sicily ~ Vittoria
## 8 7 France This dry a~ <NA> 87 24 Alsace Alsace
## 9 8 Germany Savory dri~ Shine 87 12 Rheinhe~ <NA>
## 10 9 France This has g~ Les Natures 87 27 Alsace Alsace
## # ... with 129,961 more rows, and 7 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <chr>, winery <chr>, price_pln <dbl>
wines%>%
summarise(mean_price = mean(price, na.rm = T),
std_price = sd(price, na.rm = T))
## # A tibble: 1 x 2
## mean_price std_price
## <dbl> <dbl>
## 1 35.4 41.0
quantile(wines$price, na.rm = T, probs = c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 1))
## 0% 10% 25% 50% 75% 90% 100%
## 4 12 17 25 42 65 3300
wines%>%
summarise(median_price = median(price, na.rm = T))
## # A tibble: 1 x 1
## median_price
## <dbl>
## 1 25
Czy drogie wino oznacza dobre?
wines %>%
mutate(price_score_ratio = price_pln/points) %>%
select(title, price_pln, points, price_score_ratio) %>%
arrange(price_score_ratio)
## # A tibble: 129,971 x 4
## title price_pln points price_score_rat~
## <chr> <dbl> <dbl> <dbl>
## 1 Bandit NV Merlot (California) 15.8 86 0.184
## 2 Cramele Recas 2011 UnWineD Pinot Grig~ 15.8 86 0.184
## 3 Felix Solis 2013 Flirty Bird Syrah (V~ 15.8 85 0.186
## 4 Dancing Coyote 2015 White (Clarksburg) 15.8 85 0.186
## 5 Broke Ass 2009 Red Malbec-Syrah (Mend~ 15.8 84 0.188
## 6 Bandit NV Chardonnay (California) 15.8 84 0.188
## 7 Terrenal 2010 Cabernet Sauvignon (Yec~ 15.8 84 0.188
## 8 Bandit NV Merlot (California) 15.8 84 0.188
## 9 Terrenal 2010 Estate Bottled Temprani~ 15.8 84 0.188
## 10 Pam's Cuties NV Unoaked Chardonnay (C~ 15.8 83 0.190
## # ... with 129,961 more rows
wines %>%
mutate(price_score_ratio = price_pln/points) %>%
select(title, price_pln, points, price_score_ratio) %>%
filter(points >= 90) %>%
arrange(price_score_ratio)
## # A tibble: 49,045 x 4
## title price_pln points price_score_rat~
## <chr> <dbl> <dbl> <dbl>
## 1 Herdade dos Machados 2012 Toutalga Re~ 27.6 91 0.304
## 2 Snoqualmie 2006 Winemaker's Select Ri~ 31.6 91 0.347
## 3 Esser Cellars 2001 Chardonnay (Califo~ 31.6 90 0.351
## 4 Aveleda 2013 Quinta da Aveleda Estate~ 31.6 90 0.351
## 5 Rothbury Estate 2001 Chardonnay (Sout~ 31.6 90 0.351
## 6 Chateau Ste. Michelle 2011 Riesling (~ 35.6 91 0.391
## 7 Chateau Ste. Michelle 2010 Dry Riesli~ 35.6 91 0.391
## 8 Barnard Griffin 2012 Fumé Blanc Sauvi~ 35.6 91 0.391
## 9 Mano A Mano 2011 Tempranillo (Vino de~ 35.6 90 0.395
## 10 Aveleda 2014 Quinta da Aveleda Estate~ 35.6 90 0.395
## # ... with 49,035 more rows
Mediana ze względu na wartośc zmiennej coutry.
wines %>%
group_by(country) %>%
summarise(median_price_pln = median(price_pln, na.rm = T))
## # A tibble: 44 x 2
## country median_price_pln
## <chr> <dbl>
## 1 Argentina 67.2
## 2 Armenia 57.3
## 3 Australia 83.0
## 4 Austria 98.8
## 5 Bosnia and Herzegovina 49.4
## 6 Brazil 79
## 7 Bulgaria 51.4
## 8 Canada 118.
## 9 Chile 59.2
## 10 China 71.1
## # ... with 34 more rows
wines %>%
group_by(country) %>%
summarise(median_price_pln = median(price_pln, na.rm = T),
sred_punkty = mean(points, na.rm = T),
liczba_of_wines = n()) %>%
arrange(median_price_pln) %>%
filter(liczba_of_wines >= 20)
## # A tibble: 30 x 4
## country median_price_pln sred_punkty liczba_of_wines
## <chr> <dbl> <dbl> <int>
## 1 Romania 35.6 86.4 120
## 2 Bulgaria 51.4 87.9 141
## 3 Moldova 51.4 87.2 59
## 4 Chile 59.2 86.5 4472
## 5 Portugal 63.2 88.3 5691
## 6 Argentina 67.2 86.7 3800
## 7 Georgia 69.1 87.7 86
## 8 Morocco 71.1 88.6 28
## 9 Spain 71.1 87.3 6645
## 10 Greece 75.0 87.3 466
## # ... with 20 more rows
Ile jest w danych Hiszpańskich win droższych niż $100?
Który szczep wingoron (variety) jest najbardziej popularny?
Mając budżet $8 po wino jakiego szczepu (variety) najlepiej sięgnąć? Odrzuć szczepy dla których jest mniej niż 20 obserwacji
Usunięcie wszystkich z brakami
wines %>% drop_na()
## # A tibble: 22,387 x 15
## X1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 4 US Much like ~ Vintner's ~ 87 65 Oregon Willame~
## 2 10 US Soft, supp~ Mountain C~ 87 19 Califor~ Napa Va~
## 3 23 US This wine ~ Signature ~ 87 22 Califor~ Paso Ro~
## 4 25 US Oak and ea~ King Ridge~ 87 69 Califor~ Sonoma ~
## 5 35 US As with ma~ Hyland 86 50 Oregon McMinnv~
## 6 60 US Syrupy and~ Estate 86 100 Califor~ Napa Va~
## 7 62 US The aromas~ Alder Ridg~ 86 25 Washing~ Columbi~
## 8 64 US There are ~ Golden Horn 86 26 Califor~ Santa Y~
## 9 67 US A blend of~ Inspired 86 46 Washing~ Columbi~
## 10 71 US Big oak de~ Old Vine 86 40 Califor~ Alexand~
## # ... with 22,377 more rows, and 7 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <chr>, winery <chr>, price_pln <dbl>
Usunięcie wszystkich obserwacji, gdzie wystepują braki w danej kolumnie
wines2<-wines %>% drop_na(price)
n = length(wines2$price)
y1=cut(wines2$price, sqrt(n))
# y1
head(table(y1),30)
## y1
## (0.704,13.5] (13.5,23] (23,32.5] (32.5,41.99] (41.99,51.49]
## 15821 35109 24764 14175 11411
## (51.49,60.99] (60.99,70.49] (70.49,79.99] (79.99,89.49] (89.49,98.99]
## 6457 3934 2063 1898 1308
## (98.99,108.5] (108.5,118] (118,127.5] (127.5,137] (137,146.5]
## 941 418 647 304 229
## (146.5,156] (156,165.5] (165.5,175] (175,184.5] (184.5,194]
## 313 121 63 129 53
## (194,203.5] (203.5,213] (213,222.5] (222.5,232] (232,241.5]
## 137 30 27 64 39
## (241.5,251] (251,260.5] (260.5,270] (270,279.5] (279.5,289]
## 66 30 7 28 13
y2=cut(wines2$price,breaks=c(1,20,100,300,500))
head(y2, 10)
## [1] (1,20] (1,20] (1,20] (20,100] (1,20] (1,20] (20,100]
## [8] (1,20] (20,100] (1,20]
## Levels: (1,20] (20,100] (100,300] (300,500]
levels(y2)=c("bardzo tanie", "tanie", "drogie", "bardzo drogie")
table(y2)
## y2
## bardzo tanie tanie drogie bardzo drogie
## 46341 71268 3050 225
rpivotTable(diamonds, subtotals=TRUE)
Made by:
Majkowska Agata
agata.majkowska@ug.edu.pl