knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning = FALSE)
library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.3
## ✓ readr   1.1.1     ✓ stringr 1.4.0
## ✓ ggplot2 3.3.3     ✓ forcats 0.3.0
## Warning: package 'stringr' was built under R version 3.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Summarize and Plot

# Load the rainfall table: a `Year` column plus one column per month.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where that folder exists -- consider a project-relative path.
rain <- read_csv("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/rainfall.csv")

tidyr has been updated

gather() and spread() have been superseded by pivot_longer() and pivot_wider()

# Preview the long layout: one row per (year, month) with its rainfall value.
# Nothing is assigned here; the result is only printed.
rain %>%
  rename(year = Year) %>%
  pivot_longer(
    cols = -year,
    names_to = "month",
    values_to = "rainfall"
  )
## # A tibble: 576 x 3
##     year month rainfall
##    <int> <chr>    <dbl>
##  1  1970 Jan      13.5 
##  2  1970 Feb       4.46
##  3  1970 Mar       1.92
##  4  1970 Apr       2.63
##  5  1970 May       1.36
##  6  1970 Jun       0.85
##  7  1970 Jul       0.01
##  8  1970 Aug      NA   
##  9  1970 Sep       1.81
## 10  1970 Oct       3.25
## # … with 566 more rows
#install.packages("tidyr")
# Load the wine reviews used by every example below.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where that folder exists -- consider a project-relative path.
winemag_data <- read_rds("/Users/Rose/Desktop/Masters In Data Science/Data Science with R/wine.rds")

Example

  1. Load wine data
  2. get rid of prices that are NA
  3. only keep oregon wines
  4. extract year from the title
  5. Join with rainfall
  6. pivot longer
# Build a long table with one row per Oregon wine per month, carrying the
# rainfall recorded for that wine's vintage year.
wine_rain <- winemag_data %>%
  # keep priced Oregon wines only
  filter(!is.na(price) & province == "Oregon") %>%
  # vintage year = first run of four digits in the title (NA if there is none)
  mutate(year = as.numeric(str_extract(title, "(\\d{4})"))) %>%
  left_join(rain, by = c("year" = "Year")) %>%
  # Pick the rainfall columns by name (every column of `rain` except the join
  # key) instead of by position: positions such as 16:27 silently point at
  # the wrong columns as soon as the wine data gains, loses or reorders one.
  pivot_longer(
    all_of(setdiff(names(rain), "Year")),
    names_to = "month",
    values_to = "rainfall"
  )
# Print the columns relevant to the wine/rainfall pairing.
wine_rain %>%
  select(title, month, year, rainfall)
## # A tibble: 61,764 x 4
##    title                                         month  year rainfall
##    <chr>                                         <chr> <dbl>    <dbl>
##  1 Rainstorm 2013 Pinot Gris (Willamette Valley) Jan    2013     1.63
##  2 Rainstorm 2013 Pinot Gris (Willamette Valley) Feb    2013     1.42
##  3 Rainstorm 2013 Pinot Gris (Willamette Valley) Mar    2013     2.21
##  4 Rainstorm 2013 Pinot Gris (Willamette Valley) Apr    2013     2.39
##  5 Rainstorm 2013 Pinot Gris (Willamette Valley) May    2013     2.94
##  6 Rainstorm 2013 Pinot Gris (Willamette Valley) Jun    2013     1.02
##  7 Rainstorm 2013 Pinot Gris (Willamette Valley) Jul    2013     0   
##  8 Rainstorm 2013 Pinot Gris (Willamette Valley) Aug    2013     0.35
##  9 Rainstorm 2013 Pinot Gris (Willamette Valley) Sep    2013     7.05
## 10 Rainstorm 2013 Pinot Gris (Willamette Valley) Oct    2013     0.63
## # … with 61,754 more rows
# Replace the wide rainfall table with its long form:
# one row per (year, month) with the rainfall value in `rainfall`.
rain <- rain %>%
  rename(year = Year) %>%
  pivot_longer(
    cols = -year,
    names_to = "month",
    values_to = "rainfall"
  )

# Show the first rows of the reshaped table.
head(rain)
## # A tibble: 6 x 3
##    year month rainfall
##   <int> <chr>    <dbl>
## 1  1970 Jan      13.5 
## 2  1970 Feb       4.46
## 3  1970 Mar       1.92
## 4  1970 Apr       2.63
## 5  1970 May       1.36
## 6  1970 Jun       0.85
# Mean population and mean wine price per country and year.
# `population` is not defined in this file; presumably it is the example
# dataset shipped with tidyr (country, year, population) -- confirm.
winemag_data %>%
  # Spell the join keys out so the join does not depend on whichever column
  # names the two tables happen to share.
  left_join(population, by = c("country", "year")) %>%
  filter(!is.na(population), !is.na(year)) %>%
  # drop the very large countries
  filter(population < 100000000) %>%
  group_by(country, year) %>%
  summarize(
    population = mean(population),
    # prices can be missing; without na.rm a single NA makes the whole
    # country/year average NA
    price = mean(price, na.rm = TRUE)
  )
## # A tibble: 348 x 4
## # Groups:   country [32]
##    country    year population price
##    <chr>     <dbl>      <dbl> <dbl>
##  1 Argentina  1999   36514558   8  
##  2 Argentina  2000   36903067  16.5
##  3 Argentina  2001   37273361  13.2
##  4 Argentina  2002   37627545  24  
##  5 Argentina  2003   37970411  35.3
##  6 Argentina  2004   38308779  45.3
##  7 Argentina  2005   38647854  23.1
##  8 Argentina  2006   38988923  22.4
##  9 Argentina  2007   39331357  24.2
## 10 Argentina  2008   39676083  25.9
## # … with 338 more rows

sometimes you want to do a bunch of if/else in your mutate all at once

# Add `month_number`: the calendar position (1-12) of each three-letter month
# abbreviation. A month that matches no branch gets NA.
rain <- mutate(
  rain,
  month_number = case_when(
    month == "Jan" ~ 1,
    month == "Feb" ~ 2,
    month == "Mar" ~ 3,
    month == "Apr" ~ 4,
    month == "May" ~ 5,
    month == "Jun" ~ 6,
    month == "Jul" ~ 7,
    month == "Aug" ~ 8,
    month == "Sep" ~ 9,
    month == "Oct" ~ 10,
    month == "Nov" ~ 11,
    month == "Dec" ~ 12
  )
)
  1. Use case_when() and/or str_detect() to create a new variable called character with the values tart, spicy, bold, and cherry for Oregon wines with those terms in the description, then plot the density of log(price) by character.
library(ggplot2)
# Price density (log10 x-axis) of Oregon wines, filled by flavor keyword.
# case_when() keeps the first branch that matches, so a description that
# mentions several keywords is labelled by the earliest one listed here.
winemag_data %>%
  filter(province == "Oregon") %>%
  mutate(
    character = case_when(
      str_detect(description, "[Tt]art") ~ "tart",
      str_detect(description, "[Ss]picy") ~ "spicy",
      str_detect(description, "[Bb]old") ~ "bold",
      str_detect(description, "[Cc]herry") ~ "cherry"
    )
  ) %>%
  # wines with none of the keywords have no label; leave them out
  filter(!is.na(character)) %>%
  ggplot(aes(x = price, fill = character)) +
  geom_density(alpha = 0.5) +
  scale_x_log10()

#wine <- winemag_data %>%
 #   pivot_wider(names_from = "months", values_from = "rainfall") %>%
  #  mutate(character = 
   #           case_when(str_detect(description, "[Tt]art") ~ "tart",
    #                    str_detect(description, "[Ss]picy") ~ "spicy",
     #                   str_detect(description, "[Bb]old") ~ "bold",
      #                  str_detect(description, "[Cc]herry") ~ "cherry")) 

aesthetics

  • x
  • y
  • fill
  • color

geometry

  • line plot
  • bar plot
  • Histograms
  • violin plots/box plots

Lines and scatter

  • geom_point()
  • geom_jitter() - scatter plot where each point is moved by a small random amount so overlapping points stay visible
  • geom_line()

Bar - counts or categorical levels

  • geom_bar()
  • geom_col()

Histogram - a histogram buckets real values into bins; a density normalizes the data, which helps when we want to compare groups

  • geom_histogram()
    • takes a count for the bins
  • geom_density()
    • takes a % in bins for the count

Box

  • geom_boxplot() - how much is in the main chunk, how many outliers do we have?
  • geom_violin() - marries box and density
# Oregon wines labelled with a flavor keyword from their description.
# case_when() keeps the first matching branch; wines matching none of the
# keywords get NA and are removed by the final filter.
wine1 <- winemag_data %>%
  filter(province == "Oregon") %>%
  mutate(
    character = case_when(
      str_detect(description, "[Tt]art") ~ "tart",
      str_detect(description, "[Ss]picy") ~ "spicy",
      str_detect(description, "[Bb]old") ~ "bold",
      str_detect(description, "[Cc]herry") ~ "cherry"
    )
  ) %>%
  filter(!is.na(character))
# Jittered scatter of review points by year (after 1995), colored by flavor.
wine1 %>%
  filter(year > 1995, !is.na(character)) %>%
  ggplot(aes(x = year, y = points, color = character)) +
  geom_jitter()

# Bar chart: number of wines per flavor label.
wine1 %>%
  filter(!is.na(character)) %>%
  ggplot(aes(x = character)) +
  geom_bar()

Plot the counts of each character of wine between 1995 and 2015

# Count wines per year and flavor label for years after 1995 up to and
# including 2015.
wine1 %>%
  filter(year > 1995, year <= 2015, !is.na(character)) %>%
  group_by(year, character) %>%
  summarize(n = n())
## # A tibble: 59 x 3
## # Groups:   year [18]
##     year character     n
##    <dbl> <chr>     <int>
##  1  1996 tart          1
##  2  1997 tart          1
##  3  1998 bold          1
##  4  1998 cherry        8
##  5  1998 tart         12
##  6  1999 cherry        7
##  7  1999 spicy         7
##  8  1999 tart         11
##  9  2000 cherry        2
## 10  2000 spicy         1
## # … with 49 more rows
# Histogram of price for the labelled Oregon wines (default binning).
ggplot(wine1, aes(x = price)) +
  geom_histogram()

# Violin plot of log(price) for each flavor label.
wine1 %>%
  filter(!is.na(character)) %>%
  ggplot(aes(x = character, y = log(price), color = character)) +
  geom_violin()

Use any of the techniques that you’ve learned thus far to answer the following:

Is there a relationship between rainfall and wine quality in Oregon?

library(skimr)
# Summary statistics of review points per vintage year, for years after 1995
# up to and including 2015.
# NOTE(review): wine_rain is in long format (one row per wine per month), so
# each wine appears to be counted once per month here, which inflates `n` --
# confirm whether one row per wine was intended.
wine_rain %>%
  filter(year > 1995, year <= 2015) %>%
  group_by(year) %>%
  skim(points)
## Skim summary statistics
##  n obs: 61764 
##  n variables: 17 
##  group variables: year 
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##  year variable missing complete     n  mean   sd p0   p25 p50   p75 p100
##  1996   points       0       48    48 88    1.6  86 86.75  88 89.25   90
##  1997   points       0       24    24 88    1.02 87 87     88 89      89
##  1998   points       0      684   684 87.28 3    81 85     88 90      93
##  1999   points       0      528   528 86.7  2.63 80 85     86 88.25   95
##  2000   points       0      504   504 87.05 3    80 86     87 89      94
##  2001   points       0      216   216 85.33 3.27 80 82     85 88      92
##  2002   points       0       36    36 86.67 1.26 85 85     87 88      88
##  2003   points       0       12    12 87    0    87 87     87 87      87
##  2004   points       0      240   240 87.4  2.09 83 85.75  88 88      91
##  2005   points       0     1200  1200 87.85 2.3  82 87     88 89      95
##  2006   points       0     2400  2400 88.14 2.56 80 87     88 90      95
##  2007   points       0     1956  1956 87.75 2.78 80 86     88 89      95
##  2008   points       0     2880  2880 87.94 2.78 81 86     88 90      95
##  2009   points       0     3720  3720 89.15 2.9  82 87     89 91      99
##  2010   points       0     3756  3756 88.46 2.85 81 86     88 91      95
##  2011   points       0     5100  5100 88.44 2.82 81 87     88 90      99
##  2012   points       0     9396  9396 89.46 2.8  81 87     90 92      97
##  2013   points       0     9372  9372 89.41 2.68 81 87     90 91      96
##  2014   points       0    12816 12816 89.77 2.66 82 88     90 92      96
##  2015   points       0     6876  6876 89.68 2.38 81 88     90 91      96
##      hist
##  ▇▇▁▁▁▇▁▇
##  ▇▁▁▁▁▁▁▇
##  ▂▃▃▅▃▇▁▂
##  ▁▁▆▇▃▂▁▁
##  ▂▁▃▇▅▃▂▁
##  ▃▇▂▇▃▆▁▃
##  ▇▁▁▁▁▇▁▇
##  ▁▁▁▇▁▁▁▁
##  ▁▅▁▃▇▁▂▂
##  ▁▁▂▇▃▁▁▁
##  ▁▂▂▃▇▅▂▁
##  ▁▂▂▅▇▃▁▁
##  ▁▂▅▇▅▅▃▁
##  ▁▃▇▇▆▂▁▁
##  ▁▂▇▇▃▆▅▁
##  ▁▃▆▇▃▁▁▁
##  ▁▂▆▇▇▆▂▁
##  ▁▁▃▇▇▇▃▁
##  ▁▂▅▇▃▇▃▁
##  ▁▁▂▇▇▇▂▁
#rains <- rainfall %>%
 #   rename("year" = "Year") %>%
  #  pivot_long(-year, names_to = 'month', values_to = 'rainfall') %>%
   # mutate(rainfall = ifelse(is.na(rainfall), 0, rainfall)) %>%
    #filter(month %in% c('May','Jun','Jul', 'Aug', 'Sep')) %>%
    #group_by(Year) %>%
    #summarise(summer_rain = sum(rainfall))
#wines <- wine %>%
 #   filter(points > 88) %>%
  #  group_by(year) %>%
   # summarize(avg_price= mean(price), avg_points = mean(points)) %>%
    #left_join(rains)