# Global chunk options: echo the code in the rendered document, but keep
# package messages and warnings out of it.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.3
## ✓ readr 1.1.1 ✓ stringr 1.4.0
## ✓ ggplot2 3.3.3 ✓ forcats 0.3.0
## Warning: package 'stringr' was built under R version 3.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Rainfall table in wide form: a `Year` column plus one column per month.
rain <- read_csv(
  file = "/Users/Rose/Desktop/Masters In Data Science/Data Science with R/rainfall.csv"
)
gather() and spread() have been superseded by pivot_longer() and pivot_wider().
# Preview the rainfall table in long form (one row per year/month pair).
# The result is only printed; `rain` itself is left in wide form.
rain %>%
  rename(year = Year) %>%
  pivot_longer(
    cols = -year,
    names_to = "month",
    values_to = "rainfall"
  )
## # A tibble: 576 x 3
## year month rainfall
## <int> <chr> <dbl>
## 1 1970 Jan 13.5
## 2 1970 Feb 4.46
## 3 1970 Mar 1.92
## 4 1970 Apr 2.63
## 5 1970 May 1.36
## 6 1970 Jun 0.85
## 7 1970 Jul 0.01
## 8 1970 Aug NA
## 9 1970 Sep 1.81
## 10 1970 Oct 3.25
## # … with 566 more rows
#install.packages("tidyr")
# Load the wine reviews, keep the priced Oregon wines, and attach the monthly
# rainfall for the year found in each wine's title. The result is in long
# form: one row per wine and month.
winemag_data <- read_rds("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/wine.rds")
wine_rain <- winemag_data %>%
  filter(!is.na(price), province == "Oregon") %>%
  # The year is the first run of four digits in the title; titles without
  # one get NA and therefore no rainfall after the join.
  mutate(year = as.numeric(str_extract(title, "\\d{4}"))) %>%
  # `rain` must still be in wide form (with a `Year` column) when this runs.
  left_join(rain, by = c("year" = "Year")) %>%
  # Pick the month columns by name instead of by position (previously 16:27),
  # so the reshape keeps working if columns are added to or removed from
  # either table. `month.abb` is base R's "Jan", ..., "Dec"; `all_of()` errors
  # loudly if any of those columns is missing.
  pivot_longer(all_of(month.abb), names_to = "month", values_to = "rainfall")
# Spot-check the joined table: title, month, year and rainfall only.
select(wine_rain, title, month, year, rainfall)
## # A tibble: 61,764 x 4
## title month year rainfall
## <chr> <chr> <dbl> <dbl>
## 1 Rainstorm 2013 Pinot Gris (Willamette Valley) Jan 2013 1.63
## 2 Rainstorm 2013 Pinot Gris (Willamette Valley) Feb 2013 1.42
## 3 Rainstorm 2013 Pinot Gris (Willamette Valley) Mar 2013 2.21
## 4 Rainstorm 2013 Pinot Gris (Willamette Valley) Apr 2013 2.39
## 5 Rainstorm 2013 Pinot Gris (Willamette Valley) May 2013 2.94
## 6 Rainstorm 2013 Pinot Gris (Willamette Valley) Jun 2013 1.02
## 7 Rainstorm 2013 Pinot Gris (Willamette Valley) Jul 2013 0
## 8 Rainstorm 2013 Pinot Gris (Willamette Valley) Aug 2013 0.35
## 9 Rainstorm 2013 Pinot Gris (Willamette Valley) Sep 2013 7.05
## 10 Rainstorm 2013 Pinot Gris (Willamette Valley) Oct 2013 0.63
## # … with 61,754 more rows
Change the NA values to 0, then summarize.
The right way of doing this would be to summarize and flatten by year.
# Replace the wide rainfall table with its long form: the `Year` column is
# renamed to `year` and every other column becomes a month/rainfall pair.
rain <- pivot_longer(
  rename(rain, year = Year),
  cols = -year,
  names_to = "month",
  values_to = "rainfall"
)
head(rain)
## # A tibble: 6 x 3
## year month rainfall
## <int> <chr> <dbl>
## 1 1970 Jan 13.5
## 2 1970 Feb 4.46
## 3 1970 Mar 1.92
## 4 1970 Apr 2.63
## 5 1970 May 1.36
## 6 1970 Jun 0.85
# Mean population and mean wine price per country and year, restricted to
# country-years with fewer than 100 million inhabitants.
# NOTE(review): `population` is presumably the example data set shipped with
# tidyr (columns country, year, population) -- confirm.
winemag_data %>%
  # Spell out the join keys instead of relying on a natural join, so an
  # unrelated shared column name can never silently change the match.
  left_join(population, by = c("country", "year")) %>%
  filter(!is.na(population), !is.na(year)) %>%
  filter(population < 1e8) %>%
  group_by(country, year) %>%
  # NOTE(review): mean(price) is NA for any group containing an unpriced
  # wine; add na.rm = TRUE if those groups should still get an average.
  summarize(population = mean(population), price = mean(price))
## # A tibble: 348 x 4
## # Groups: country [32]
## country year population price
## <chr> <dbl> <dbl> <dbl>
## 1 Argentina 1999 36514558 8
## 2 Argentina 2000 36903067 16.5
## 3 Argentina 2001 37273361 13.2
## 4 Argentina 2002 37627545 24
## 5 Argentina 2003 37970411 35.3
## 6 Argentina 2004 38308779 45.3
## 7 Argentina 2005 38647854 23.1
## 8 Argentina 2006 38988923 22.4
## 9 Argentina 2007 39331357 24.2
## 10 Argentina 2008 39676083 25.9
## # … with 338 more rows
# Add the calendar position (1-12) of each abbreviated month name.
# `month.abb` is base R's c("Jan", ..., "Dec"); anything not in it maps to NA.
# as.numeric() keeps the column a double.
rain <- rain %>%
  mutate(month_number = as.numeric(match(month, month.abb)))
library(ggplot2)
# Price densities (log10 x-axis) of Oregon wines, split by the first flavour
# keyword found in the review text. Keywords are tested in the order listed,
# and wines matching none of them are dropped.
winemag_data %>%
  filter(province == "Oregon") %>%
  mutate(
    character = case_when(
      str_detect(description, "[Tt]art") ~ "tart",
      str_detect(description, "[Ss]picy") ~ "spicy",
      str_detect(description, "[Bb]old") ~ "bold",
      str_detect(description, "[Cc]herry") ~ "cherry"
    )
  ) %>%
  filter(!is.na(character)) %>%
  ggplot(aes(x = price, fill = character)) +
  geom_density(alpha = 0.5) +
  scale_x_log10()
#wine <- winemag_data %>%
# pivot_wider(names_from = "months", values_from = "rainfall") %>%
# mutate(character =
# case_when(str_detect(description, "[Tt]art") ~ "tart",
# str_detect(description, "[Ss]picy") ~ "spicy",
# str_detect(description, "[Bb]old") ~ "bold",
# str_detect(description, "[Cc]herry") ~ "cherry"))
# Oregon wines labelled with the first flavour keyword that appears in their
# description (checked in the order tart, spicy, bold, cherry). Wines that
# mention none of the keywords are removed.
wine1 <- winemag_data %>%
  filter(province == "Oregon") %>%
  mutate(
    character = case_when(
      str_detect(description, "[Tt]art") ~ "tart",
      str_detect(description, "[Ss]picy") ~ "spicy",
      str_detect(description, "[Bb]old") ~ "bold",
      str_detect(description, "[Cc]herry") ~ "cherry"
    )
  ) %>%
  filter(!is.na(character))
# Review points against year (after 1995), coloured by flavour keyword.
wine1 %>%
  filter(year > 1995, !is.na(character)) %>%
  ggplot(aes(x = year, y = points, color = character)) +
  geom_jitter()
# Number of wines per flavour keyword.
wine1 %>%
  filter(!is.na(character)) %>%
  ggplot(mapping = aes(x = character)) +
  geom_bar()
Plot the counts of each wine character between 1995 and 2015
# Count wines per year and flavour keyword for years after 1995 up to and
# including 2015. The result stays grouped by year.
wine1 %>%
  filter(year > 1995, year <= 2015, !is.na(character)) %>%
  group_by(year, character) %>%
  summarise(n = n())
## # A tibble: 59 x 3
## # Groups: year [18]
## year character n
## <dbl> <chr> <int>
## 1 1996 tart 1
## 2 1997 tart 1
## 3 1998 bold 1
## 4 1998 cherry 8
## 5 1998 tart 12
## 6 1999 cherry 7
## 7 1999 spicy 7
## 8 1999 tart 11
## 9 2000 cherry 2
## 10 2000 spicy 1
## # … with 49 more rows
# Histogram of wine prices using ggplot2's default binning.
ggplot(wine1, aes(x = price)) +
  geom_histogram()
# Distribution of log price for each flavour keyword.
wine1 %>%
  filter(!is.na(character)) %>%
  ggplot(aes(x = character, y = log(price), color = character)) +
  geom_violin()
library(skimr)
# Summary statistics of review points per year, for years after 1995 up to
# and including 2015.
# NOTE(review): wine_rain holds one row per wine *and month* (it is the result
# of pivoting twelve month columns), so each wine is counted twelve times in
# the n/complete columns -- de-duplicate first if per-wine counts are wanted.
wine_rain %>%
  filter(year > 1995, year <= 2015) %>%
  group_by(year) %>%
  skim(points)
## Skim summary statistics
## n obs: 61764
## n variables: 17
## group variables: year
##
## ── Variable type:numeric ───────────────────────────────────────────────────────
## year variable missing complete n mean sd p0 p25 p50 p75 p100
## 1996 points 0 48 48 88 1.6 86 86.75 88 89.25 90
## 1997 points 0 24 24 88 1.02 87 87 88 89 89
## 1998 points 0 684 684 87.28 3 81 85 88 90 93
## 1999 points 0 528 528 86.7 2.63 80 85 86 88.25 95
## 2000 points 0 504 504 87.05 3 80 86 87 89 94
## 2001 points 0 216 216 85.33 3.27 80 82 85 88 92
## 2002 points 0 36 36 86.67 1.26 85 85 87 88 88
## 2003 points 0 12 12 87 0 87 87 87 87 87
## 2004 points 0 240 240 87.4 2.09 83 85.75 88 88 91
## 2005 points 0 1200 1200 87.85 2.3 82 87 88 89 95
## 2006 points 0 2400 2400 88.14 2.56 80 87 88 90 95
## 2007 points 0 1956 1956 87.75 2.78 80 86 88 89 95
## 2008 points 0 2880 2880 87.94 2.78 81 86 88 90 95
## 2009 points 0 3720 3720 89.15 2.9 82 87 89 91 99
## 2010 points 0 3756 3756 88.46 2.85 81 86 88 91 95
## 2011 points 0 5100 5100 88.44 2.82 81 87 88 90 99
## 2012 points 0 9396 9396 89.46 2.8 81 87 90 92 97
## 2013 points 0 9372 9372 89.41 2.68 81 87 90 91 96
## 2014 points 0 12816 12816 89.77 2.66 82 88 90 92 96
## 2015 points 0 6876 6876 89.68 2.38 81 88 90 91 96
## hist
## ▇▇▁▁▁▇▁▇
## ▇▁▁▁▁▁▁▇
## ▂▃▃▅▃▇▁▂
## ▁▁▆▇▃▂▁▁
## ▂▁▃▇▅▃▂▁
## ▃▇▂▇▃▆▁▃
## ▇▁▁▁▁▇▁▇
## ▁▁▁▇▁▁▁▁
## ▁▅▁▃▇▁▂▂
## ▁▁▂▇▃▁▁▁
## ▁▂▂▃▇▅▂▁
## ▁▂▂▅▇▃▁▁
## ▁▂▅▇▅▅▃▁
## ▁▃▇▇▆▂▁▁
## ▁▂▇▇▃▆▅▁
## ▁▃▆▇▃▁▁▁
## ▁▂▆▇▇▆▂▁
## ▁▁▃▇▇▇▃▁
## ▁▂▅▇▃▇▃▁
## ▁▁▂▇▇▇▂▁
#rains <- rainfall %>%
# rename("year" = "Year") %>%
# pivot_long(-year, names_to = 'month', values_to = 'rainfall') %>%
# mutate(rainfall = ifelse(is.na(rainfall), 0, rainfall)) %>%
#filter(month %in% c('May','Jun','Jul', 'Aug', 'Sep')) %>%
#group_by(Year) %>%
#summarise(summer_rain = sum(rainfall))
#wines <- wine %>%
# filter(points > 88) %>%
# group_by(year) %>%
# summarize(avg_price= mean(price), avg_points = mean(points)) %>%
#left_join(rains)