knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning = FALSE)
library(tidyr)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.3
## ✓ readr   1.1.1     ✓ stringr 1.4.0
## ✓ ggplot2 3.3.3     ✓ forcats 0.3.0

## Warning: package 'stringr' was built under R version 3.5.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Summarize and Plot

# Read rainfall.csv into `rain`.
# NOTE(review): this absolute, user-specific path only resolves on the
# author's machine; consider a project-relative path.
rain <- readr::read_csv(
  file = "/Users/Rose/Desktop/Masters In Data Science/Data Science with R/rainfall.csv"
)

tidyr has been updated

gather() and spread() have been superseded by pivot_longer() and pivot_wider(), respectively

# Preview the long layout: every column except the year is stacked into
# month / rainfall pairs. The result is only printed, `rain` is not changed.
rain %>%
  pivot_longer(cols = -Year, names_to = "month", values_to = "rainfall") %>%
  rename(year = Year)

## # A tibble: 576 x 3
##     year month rainfall
##    <int> <chr>    <dbl>
##  1  1970 Jan      13.5 
##  2  1970 Feb       4.46
##  3  1970 Mar       1.92
##  4  1970 Apr       2.63
##  5  1970 May       1.36
##  6  1970 Jun       0.85
##  7  1970 Jul       0.01
##  8  1970 Aug      NA   
##  9  1970 Sep       1.81
## 10  1970 Oct       3.25
## # … with 566 more rows

#install.packages("tidyr")

# Read wine.rds into `winemag_data`.
# NOTE(review): this absolute, user-specific path only resolves on the
# author's machine; consider a project-relative path.
winemag_data <- readr::read_rds(
  "/Users/Rose/Desktop/Masters In Data Science/Data Science with R/wine.rds"
)

Example

Load wine data
get rid of prices that are NA
only keep oregon wines
extract year from the title
Join with rainfall
pivot longer

# Build one row per Oregon wine per rainfall column:
#   1. keep rows with a known price whose province is "Oregon";
#   2. take the first run of four digits in the title as the year;
#   3. attach the rainfall columns for that year (`rain` is still in its wide
#      layout at this point and is keyed by `Year`);
#   4. stack the rainfall columns into month / rainfall pairs.
# The rainfall columns are selected by name (every column of `rain` except
# `Year`) instead of by position, so the pivot does not silently pick the
# wrong columns if winemag_data gains, loses or reorders a column.
wine_rain <- winemag_data %>%
  filter(!is.na(price), province == "Oregon") %>%
  mutate(year = as.numeric(str_extract(title, "(\\d{4})"))) %>%
  left_join(rain, by = c("year" = "Year")) %>%
  pivot_longer(
    cols = all_of(setdiff(names(rain), "Year")),
    names_to = "month",
    values_to = "rainfall"
  )

# Print only the columns needed to eyeball the join and pivot result.
select(wine_rain, title, month, year, rainfall)

## # A tibble: 61,764 x 4
##    title                                         month  year rainfall
##    <chr>                                         <chr> <dbl>    <dbl>
##  1 Rainstorm 2013 Pinot Gris (Willamette Valley) Jan    2013     1.63
##  2 Rainstorm 2013 Pinot Gris (Willamette Valley) Feb    2013     1.42
##  3 Rainstorm 2013 Pinot Gris (Willamette Valley) Mar    2013     2.21
##  4 Rainstorm 2013 Pinot Gris (Willamette Valley) Apr    2013     2.39
##  5 Rainstorm 2013 Pinot Gris (Willamette Valley) May    2013     2.94
##  6 Rainstorm 2013 Pinot Gris (Willamette Valley) Jun    2013     1.02
##  7 Rainstorm 2013 Pinot Gris (Willamette Valley) Jul    2013     0   
##  8 Rainstorm 2013 Pinot Gris (Willamette Valley) Aug    2013     0.35
##  9 Rainstorm 2013 Pinot Gris (Willamette Valley) Sep    2013     7.05
## 10 Rainstorm 2013 Pinot Gris (Willamette Valley) Oct    2013     0.63
## # … with 61,754 more rows

change the NA to 0 then summarize
now we look and it's 12 rows per wine, because there is one row for each month
in the slides there is a “join” guide
the right way of doing this would be to summarize the rainfall so there is one row per year before joining

# Overwrite `rain` with its long layout (year, month, rainfall).
# NOTE: from here on `rain` has no `Year` column, so the earlier join that
# uses by = c("year" = "Year") cannot be re-run without reloading the CSV.
rain <- rain %>%
  pivot_longer(cols = -Year, names_to = "month", values_to = "rainfall") %>%
  rename(year = Year)

# Peek at the first rows of the reshaped table.
rain %>% head()

## # A tibble: 6 x 3
##    year month rainfall
##   <int> <chr>    <dbl>
## 1  1970 Jan      13.5 
## 2  1970 Feb       4.46
## 3  1970 Mar       1.92
## 4  1970 Apr       2.63
## 5  1970 May       1.36
## 6  1970 Jun       0.85

# Mean population and mean price per country and year.
# `population` is not created in this file; presumably it is the example
# dataset shipped with tidyr (TODO confirm).
# No `by` is given, so the join uses every column name the two tables share.
winemag_data %>%
  left_join(population) %>%
  filter(
    !is.na(population),
    !is.na(year),
    population < 100000000
  ) %>%
  group_by(country, year) %>%
  summarise(
    population = mean(population),
    price = mean(price)
  )

## # A tibble: 348 x 4
## # Groups:   country [32]
##    country    year population price
##    <chr>     <dbl>      <dbl> <dbl>
##  1 Argentina  1999   36514558   8  
##  2 Argentina  2000   36903067  16.5
##  3 Argentina  2001   37273361  13.2
##  4 Argentina  2002   37627545  24  
##  5 Argentina  2003   37970411  35.3
##  6 Argentina  2004   38308779  45.3
##  7 Argentina  2005   38647854  23.1
##  8 Argentina  2006   38988923  22.4
##  9 Argentina  2007   39331357  24.2
## 10 Argentina  2008   39676083  25.9
## # … with 338 more rows

sometimes you want to do a bunch of if/else in your mutate all at once

# Add the calendar position of each month abbreviation (Jan = 1 ... Dec = 12).
# case_when() tries the conditions top to bottom; a month that matches none
# of them gets NA.
rain <- mutate(
  rain,
  month_number = case_when(
    month == "Jan" ~ 1,
    month == "Feb" ~ 2,
    month == "Mar" ~ 3,
    month == "Apr" ~ 4,
    month == "May" ~ 5,
    month == "Jun" ~ 6,
    month == "Jul" ~ 7,
    month == "Aug" ~ 8,
    month == "Sep" ~ 9,
    month == "Oct" ~ 10,
    month == "Nov" ~ 11,
    month == "Dec" ~ 12
  )
)

Use case_when() and/or str_detect() to create a new variable called character with the values tart, spicy, bold and cherry for Oregon wines with those terms in the description, then plot the density of log(price) by character.

library(ggplot2)

# Density of price (log10 x-axis) for Oregon wines, one curve per character.
# case_when() assigns the first matching term, so a description that contains
# several of the words is labelled by the earliest rule; wines matching none
# are dropped before plotting.
winemag_data %>%
  filter(province == "Oregon") %>%
  mutate(
    character = case_when(
      str_detect(description, "[Tt]art") ~ "tart",
      str_detect(description, "[Ss]picy") ~ "spicy",
      str_detect(description, "[Bb]old") ~ "bold",
      str_detect(description, "[Cc]herry") ~ "cherry"
    )
  ) %>%
  filter(!is.na(character)) %>%
  ggplot(aes(x = price, fill = character)) +
  geom_density(alpha = 0.5) +
  scale_x_log10()

#wine <- winemag_data %>%
 #   pivot_wider(names_from = "months", values_from = "rainfall") %>%
  #  mutate(character = 
   #           case_when(str_detect(description, "[Tt]art") ~ "tart",
    #                    str_detect(description, "[Ss]picy") ~ "spicy",
     #                   str_detect(description, "[Bb]old") ~ "bold",
      #                  str_detect(description, "[Cc]herry") ~ "cherry"))

aesthetics

x
y
fill
color

geometry

line plot
bar plot
Histograms
violin plots/box plots

Lines and scatter

geom_point()
geom_jitter() - scatter plot that adds a small random offset (jitter) to where each point is placed
geom_line()

Bar - counts or categorical levels

geom_bar()
geom_col()

Histogram - histogram buckets real value… density normalizes the data when we want to compare

geom_histogram()
- takes a count for the bins
geom_density()
- takes a % in bins instead of a count

Box
geom_boxplot() - how much is in the main chunk, how many outliers do we have?
geom_violin() - marries box and density

# Oregon wines labelled with a `character` taken from the description.
# Rules are tried in order (tart, spicy, bold, cherry) and the first match
# wins; wines whose description matches none of the terms are removed.
wine1 <- winemag_data %>%
  filter(province == "Oregon") %>%
  mutate(
    character = case_when(
      str_detect(description, "[Tt]art") ~ "tart",
      str_detect(description, "[Ss]picy") ~ "spicy",
      str_detect(description, "[Bb]old") ~ "bold",
      str_detect(description, "[Cc]herry") ~ "cherry"
    )
  ) %>%
  filter(!is.na(character))

# Jittered scatter of points against year (years after 1995), coloured by
# character.
wine1 %>%
  filter(year > 1995, !is.na(character)) %>%
  ggplot(mapping = aes(x = year, y = points, color = character)) +
  geom_jitter()

# Bar chart of how many wines fall in each character.
wine1 %>%
  filter(!is.na(character)) %>%
  ggplot(mapping = aes(x = character)) +
  geom_bar()

Plot the counts of each character of wine between 1995 and 2015

# Number of wines per year and character for years after 1995 up to and
# including 2015 (1995 itself is excluded by the strict `>`).
wine1 %>%
  filter(year > 1995, year <= 2015, !is.na(character)) %>%
  group_by(year, character) %>%
  summarise(n = n())

## # A tibble: 59 x 3
## # Groups:   year [18]
##     year character     n
##    <dbl> <chr>     <int>
##  1  1996 tart          1
##  2  1997 tart          1
##  3  1998 bold          1
##  4  1998 cherry        8
##  5  1998 tart         12
##  6  1999 cherry        7
##  7  1999 spicy         7
##  8  1999 tart         11
##  9  2000 cherry        2
## 10  2000 spicy         1
## # … with 49 more rows

# Histogram of price using geom_histogram()'s default binning.
ggplot(wine1, mapping = aes(x = price)) +
  geom_histogram()

# Violin plot of log(price) for each character, outlined in the character's
# colour.
wine1 %>%
  filter(!is.na(character)) %>%
  ggplot(mapping = aes(x = character, y = log(price), color = character)) +
  geom_violin()

Use any of the techniques that you’ve learned thus far to answer the following:

Is there a relationship between rainfall and wine quality in Oregon?

library(skimr)

# Summary statistics of points for each year after 1995 up to and including
# 2015.
# NOTE: wine_rain holds one row per wine per rainfall month (see the
# pivot_longer() that builds it), so the n / complete counts are counts of
# wine-month rows, not of distinct wines.
wine_rain %>%
  filter(year > 1995, year <= 2015) %>%
  group_by(year) %>%
  skim(points)

## Skim summary statistics
##  n obs: 61764 
##  n variables: 17 
##  group variables: year 
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##  year variable missing complete     n  mean   sd p0   p25 p50   p75 p100
##  1996   points       0       48    48 88    1.6  86 86.75  88 89.25   90
##  1997   points       0       24    24 88    1.02 87 87     88 89      89
##  1998   points       0      684   684 87.28 3    81 85     88 90      93
##  1999   points       0      528   528 86.7  2.63 80 85     86 88.25   95
##  2000   points       0      504   504 87.05 3    80 86     87 89      94
##  2001   points       0      216   216 85.33 3.27 80 82     85 88      92
##  2002   points       0       36    36 86.67 1.26 85 85     87 88      88
##  2003   points       0       12    12 87    0    87 87     87 87      87
##  2004   points       0      240   240 87.4  2.09 83 85.75  88 88      91
##  2005   points       0     1200  1200 87.85 2.3  82 87     88 89      95
##  2006   points       0     2400  2400 88.14 2.56 80 87     88 90      95
##  2007   points       0     1956  1956 87.75 2.78 80 86     88 89      95
##  2008   points       0     2880  2880 87.94 2.78 81 86     88 90      95
##  2009   points       0     3720  3720 89.15 2.9  82 87     89 91      99
##  2010   points       0     3756  3756 88.46 2.85 81 86     88 91      95
##  2011   points       0     5100  5100 88.44 2.82 81 87     88 90      99
##  2012   points       0     9396  9396 89.46 2.8  81 87     90 92      97
##  2013   points       0     9372  9372 89.41 2.68 81 87     90 91      96
##  2014   points       0    12816 12816 89.77 2.66 82 88     90 92      96
##  2015   points       0     6876  6876 89.68 2.38 81 88     90 91      96
##      hist
##  ▇▇▁▁▁▇▁▇
##  ▇▁▁▁▁▁▁▇
##  ▂▃▃▅▃▇▁▂
##  ▁▁▆▇▃▂▁▁
##  ▂▁▃▇▅▃▂▁
##  ▃▇▂▇▃▆▁▃
##  ▇▁▁▁▁▇▁▇
##  ▁▁▁▇▁▁▁▁
##  ▁▅▁▃▇▁▂▂
##  ▁▁▂▇▃▁▁▁
##  ▁▂▂▃▇▅▂▁
##  ▁▂▂▅▇▃▁▁
##  ▁▂▅▇▅▅▃▁
##  ▁▃▇▇▆▂▁▁
##  ▁▂▇▇▃▆▅▁
##  ▁▃▆▇▃▁▁▁
##  ▁▂▆▇▇▆▂▁
##  ▁▁▃▇▇▇▃▁
##  ▁▂▅▇▃▇▃▁
##  ▁▁▂▇▇▇▂▁

#rains <- rainfall %>%
 #   rename("year" = "Year") %>%
  #  pivot_long(-year, names_to = 'month', values_to = 'rainfall') %>%
   # mutate(rainfall = ifelse(is.na(rainfall), 0, rainfall)) %>%
    #filter(month %in% c('May','Jun','Jul', 'Aug', 'Sep')) %>%
    #group_by(Year) %>%
    #summarise(summer_rain = sum(rainfall))

#wines <- wine %>%
 #   filter(points > 88) %>%
  #  group_by(year) %>%
   # summarize(avg_price= mean(price), avg_points = mean(points)) %>%
    #left_join(rains)

Pivot longer/wider & Joins

Matt

10/5/2019

Summarize and Plot

tidyr has been updated

Example

sometimes you want to do a bunch of if/else in your mutate all at once

aesthetics

geometry

Lines and scatter

Bar - counts or categorical levels

Histogram - histogram buckets real value… density normalizes the data when we want to compare

Use any of the techniques that you’ve learned thus far to answer the following:

Is there a relationship between rainfall and wine quality in Oregon?