Introduction

Question2

Variation

Visualizing distribution

# Bar chart: number of diamonds in each level of `cut`.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar()

# Tabulate the bar heights shown above: one row per level of `cut`.
count(diamonds, cut)
## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551
    # Histogram of `carat` using bins that are 0.5 wide.
    diamonds %>%
      ggplot(aes(x = carat)) +
      geom_histogram(binwidth = 0.5)

# Tabulate the histogram bins by hand: cut_width() assigns each carat value
# to a 0.5-wide interval and count() tallies the rows per interval.
count(diamonds, cut_width(carat, 0.5))
## # A tibble: 11 × 2
##    `cut_width(carat, 0.5)`     n
##    <fct>                   <int>
##  1 [-0.25,0.25]              785
##  2 (0.25,0.75]             29498
##  3 (0.75,1.25]             15977
##  4 (1.25,1.75]              5313
##  5 (1.75,2.25]              2002
##  6 (2.25,2.75]               322
##  7 (2.75,3.25]                32
##  8 (3.25,3.75]                 5
##  9 (3.75,4.25]                 4
## 10 (4.25,4.75]                 1
## 11 (4.75,5.25]                 1
# Keep only the diamonds below 3 carats.
smaller <- filter(diamonds, carat < 3)

# Histogram of the restricted data with narrower (0.1-wide) bins.
smaller %>%
  ggplot(aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

# Overlaid frequency polygons of `carat`, one coloured line per `cut`.
# No bin width is given, so stat_bin() uses its default number of bins.
ggplot(data = diamonds, mapping = aes(x = carat, color = cut)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Typical values: look at the distribution of `carat` with very narrow
# (0.01-wide) bins.
# Keep only diamonds below 3 carats, i.e. filter OUT the sparse upper tail,
# matching the `carat < 3` cutoff used elsewhere in this analysis.
diamonds %>%
  filter(carat < 3) %>%
  ggplot(aes(x = carat)) +
  geom_histogram(binwidth = 0.01)

# Histogram of the `eruptions` column of `faithful`, using 0.25-wide bins.
# The data frame is supplied once, through the pipe. Passing it again as
# `data =` would push the piped copy into ggplot()'s `...`, where it is unused.
faithful %>%
  ggplot(mapping = aes(x = eruptions)) +
  geom_histogram(binwidth = 0.25)

Unusual Values

# Histogram of the `y` variable with the default binning.
ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram of `y`, zoomed so that only counts from 0 to 50 are shown.
# coord_cartesian() limits the visible range of the count axis.
ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values

# Replace values of `y` below 3 or above 20 with NA rather than dropping the
# affected rows (the alternative would be filtering those rows away), then
# draw a scatterplot of y against x.
diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y)) %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point()
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

A categorical and a continuous variable

# Boxplots of `price`, one box per level of `cut`.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

Two categorical variables

# Count every (color, cut) combination and draw the counts as a tile plot,
# with the fill of each tile mapped to the count `n`.
ggplot(
  data = count(diamonds, color, cut),
  mapping = aes(x = color, y = cut, fill = n)
) +
  geom_tile()

Two continuous variables

library(hexbin)
# Hexagonal binning of `price` against `carat`.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_hex()

# For diamonds below 3 carats, bin `carat` into 0.1-wide intervals and draw
# one boxplot of `price` per interval.
ggplot(
  data = filter(diamonds, carat < 3),
  mapping = aes(x = carat, y = price)
) +
  geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))

Patterns and models

library(modelr)
# Fit a linear model of log(price) on log(carat).
mod <- lm(log(price) ~ log(carat), data = diamonds)

# Attach the model residuals as a `resid` column, then exponentiate them to
# undo the log transform used in the model.
diamonds2 <- mutate(
  modelr::add_residuals(diamonds, mod),
  resid = exp(resid)
)

# Scatterplot of the exponentiated residuals against `carat`.
ggplot(data = diamonds2, mapping = aes(x = carat, y = resid)) +
  geom_point()

# Boxplots of the exponentiated residuals, one box per level of `cut`.
ggplot(data = diamonds2, mapping = aes(x = cut, y = resid)) +
  geom_boxplot()