if(!require('lvplot')) {
  install.packages('lvplot')
  library('lvplot')
}
## Loading required package: lvplot
if(!require('ggbeeswarm')) {
  install.packages('ggbeeswarm')
  library('ggbeeswarm')
}
## Loading required package: ggbeeswarm
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ✔ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(nycflights13)

(Start)

7.3.4 Variation

1. Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.

summary(select(diamonds, x, y,z))
##        x                y                z         
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.710   1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.700   Median : 5.710   Median : 3.530  
##  Mean   : 5.731   Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :10.740   Max.   :58.900   Max.   :31.800

From this dataset you can see X & Y > Z and there are outliers. Some diamonds are values at 0.

filter(diamonds, x == 0 | y == 0 | z == 0)
## # A tibble: 20 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  1    Premium   G     SI2      59.1    59  3142  6.55  6.48     0
##  2  1.01 Premium   H     I1       58.1    59  3167  6.66  6.6      0
##  3  1.1  Premium   G     SI2      63      59  3696  6.5   6.47     0
##  4  1.01 Premium   F     SI2      59.2    58  3837  6.5   6.47     0
##  5  1.5  Good      G     I1       64      61  4731  7.15  7.04     0
##  6  1.07 Ideal     F     SI2      61.6    56  4954  0     6.62     0
##  7  1    Very Good H     VS2      63.3    53  5139  0     0        0
##  8  1.15 Ideal     G     VS2      59.2    56  5564  6.88  6.83     0
##  9  1.14 Fair      G     VS1      57.5    67  6381  0     0        0
## 10  2.18 Premium   H     SI2      59.4    61 12631  8.49  8.45     0
## 11  1.56 Ideal     G     VS2      62.2    54 12800  0     0        0
## 12  2.25 Premium   I     SI1      61.3    58 15397  8.52  8.42     0
## 13  1.2  Premium   D     VVS1     62.1    59 15686  0     0        0
## 14  2.2  Premium   H     SI1      61.2    59 17265  8.42  8.37     0
## 15  2.25 Premium   H     SI2      62.8    59 18034  0     0        0
## 16  2.02 Premium   H     VS2      62.7    53 18207  8.02  7.95     0
## 17  2.8  Good      G     SI2      63.8    58 18788  8.9   8.85     0
## 18  0.71 Good      F     SI2      64.1    60  2130  0     0        0
## 19  0.71 Good      F     SI2      64.1    60  2130  0     0        0
## 20  1.12 Premium   G     I1       60.4    59  2383  6.71  6.67     0
diamonds %>%
  arrange(desc(y)) %>%
  head()
## # A tibble: 6 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  2    Premium H     SI2      58.9    57 12210  8.09 58.9   8.06
## 2  0.51 Ideal   E     VS1      61.8    55  2075  5.15 31.8   5.12
## 3  5.01 Fair    J     I1       65.5    59 18018 10.7  10.5   6.98
## 4  4.5  Fair    J     I1       65.8    58 18531 10.2  10.2   6.72
## 5  4.01 Premium I     I1       61      61 15223 10.1  10.1   6.17
## 6  4.01 Premium J     I1       62.5    62 15223 10.0   9.94  6.24
diamonds %>%
 arrange(desc(z)) %>%
 head()
## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.51 Very Good E     VS1      61.8  54.7  1970  5.12  5.15 31.8 
## 2  2    Premium   H     SI2      58.9  57   12210  8.09 58.9   8.06
## 3  5.01 Fair      J     I1       65.5  59   18018 10.7  10.5   6.98
## 4  4.5  Fair      J     I1       65.8  58   18531 10.2  10.2   6.72
## 5  4.13 Fair      H     I1       64.8  61   17329 10     9.85  6.43
## 6  3.65 Fair      H     I1       67.1  53   11668  9.53  9.48  6.38

Closer look at the right skew of (x,y,z)

filter(diamonds, x > 0, x < 10) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = x), binwidth = 0.01) +
  scale_x_continuous(breaks = 1:10)

filter(diamonds, y > 0, y< 10) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = y), binwidth= 0.01) +
  scale_x_continuous(breaks = 1:10)

Summarize the data, comparing the values of (x,y,z) which describe the length, width, and depth.

summarise(diamonds, mean(x>y), mean(x>z), mean(y>z))
## # A tibble: 1 × 3
##   `mean(x > y)` `mean(x > z)` `mean(y > z)`
##           <dbl>         <dbl>         <dbl>
## 1         0.434          1.00          1.00

Depth is always smaller the length or width. The shallower the diamond is the more affect it has the reflection of light.

2. Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)

ggplot(filter(diamonds, price < 2500), aes(x = price))+ 
  geom_histogram(binwidth= 10, center = 0)

ggplot(filter(diamonds), aes(x = price)) +
  geom_histogram(binwidth = 100, center =0)

3. How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference?

diamonds %>%
  filter(carat >= 0.99, carat <= 1) %>%
  count(carat)
## # A tibble: 2 × 2
##   carat     n
##   <dbl> <int>
## 1  0.99    23
## 2  1     1558
diamonds %>%
  filter(carat >= 0.9, carat <= 1.1) %>%
  count(carat) %>%
  print(n= Inf)
## # A tibble: 21 × 2
##    carat     n
##    <dbl> <int>
##  1  0.9   1485
##  2  0.91   570
##  3  0.92   226
##  4  0.93   142
##  5  0.94    59
##  6  0.95    65
##  7  0.96   103
##  8  0.97    59
##  9  0.98    31
## 10  0.99    23
## 11  1     1558
## 12  1.01  2242
## 13  1.02   883
## 14  1.03   523
## 15  1.04   475
## 16  1.05   361
## 17  1.06   373
## 18  1.07   342
## 19  1.08   246
## 20  1.09   287
## 21  1.1    278

4. Compare and contrast coord_cartesian() vs xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?

ggplot(diamonds) +
  geom_histogram(mapping = aes(x = price)) +
  coord_cartesian(xlim = c(100, 5000), ylim = c(0, 3000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Vs.

ggplot(diamonds) +
  geom_histogram(mapping = aes(x = price)) +
  xlim(100, 5000) +
  ylim(0, 3000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14714 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).

7.4.1 Missing Values

1.What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference?

Missing values are removed when the number of observations in each bin are calculated. See the warning message: Removed 9 rows containing non-finite values (stat_bin)

diamonds2 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

ggplot(diamonds2, aes(x = y)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

diamonds %>%
  mutate(cut = if_else(runif(n()) < 0.1, NA_character_, as.character(cut))) %>%
  ggplot() +
  geom_bar(mapping = aes(x = cut))

In a histogram, the x aesthetic variable needs to be numeric, and stat_bin() groups the observations by ranges into bins. Since the numeric value of the NA observations is unknown, they cannot be placed in a particular bin, and are dropped.In the geom_bar() function, NA is treated as another category. The x aesthetic in geom_bar() requires a discrete (categorical) variable, and missing values act like another category.

2. What does na.rm = TRUE do in mean() and sum()?

This option removes NA values from the vector prior to calculating the mean and sum.

mean(c(0, 1, 2, NA), na.rm = TRUE)
## [1] 1
sum(c(0, 1, 2, NA), na.rm = TRUE)
## [1] 3

In a histogram, the x aesthetic variable needs to be numeric, and stat_bin() groups the observations by ranges into bins. Since the numeric value of the NA observations is unknown, they cannot be placed in a particular bin, and are dropped.

7.5.1.1 Corvariation

1.Use what you’ve learned to improve the visualisation of the departure times of cancelled vs. non-cancelled flights.

nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot() +
  geom_boxplot(mapping = aes(y = sched_dep_time, x = cancelled))

2. What variable in the diamonds dataset is most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive?

important variables: carat, clarity, color, and cut price and carat are continuous variables

diamonds %>%
  mutate(color = fct_rev(color)) %>%
  ggplot(aes(x = color, y = price)) +
  geom_boxplot()

There is weak negative relationship between clarity and price. The scale of clarity goes from I1 (worst) to IF (best).

ggplot(data = diamonds) +
  geom_boxplot(mapping = aes(x = clarity, y = price))

For both clarity and color, there is a much larger amount of variation within each category than between categories. Carat is clearly the single best predictor of diamond prices.

Now that we have established that carat appears to be the best predictor of price, what is the relationship between it and cut? Since this is an example of a continuous (carat) and categorical (cut) variable, it can be visualized with a box plot.

ggplot(diamonds, aes(x = cut, y = carat)) +
  geom_boxplot()

There is a lot of variability in the distribution of carat sizes within each cut category. There is a slight negative relationship between carat and cut. Noticeably, the largest carat diamonds have a cut of “Fair” (the lowest).

This negative relationship can be due to the way in which diamonds are selected for sale. A larger diamond can be profitably sold with a lower quality cut, while a smaller diamond requires a better cut.

3. Install the ggstance package, and create a horizontal boxplot. How does this compare to using coord_flip()?

ggplot(data = diamonds) +
  geom_boxplot(mapping = aes(x = reorder(cut, carat, FUN = median), y = carat)) +
    coord_flip()

4. One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of “outlying values”. One approach to remedy this problem is the letter value plot. Install the lvplot package, and try using geom_lv() to display the distribution of price vs cut. What do you learn? How do you interpret the plots?

Like box-plots, the boxes of the letter-value plot correspond to quantiles. However, they incorporate far more quantiles than box-plots. They are useful for larger datasets because,

  • larger datasets can give precise estimates of quantiles beyond the quartiles, and
  • in expectation, larger datasets should have more outliers (in absolute numbers).
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_lv()

geom_lv shows the distrubtion of prices by cut a similar shape of tower of babble by the width.

5. Compare and contrast geom_violin() with a facetted geom_histogram(), or a coloured geom_freqpoly(). What are the pros and cons of each method?

ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut, ncol = 1, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_violin() +
  coord_flip()

6. If you have a small dataset, it’s sometimes useful to use geom_jitter() to see the relationship between a continuous and categorical variable. The ggbeeswarm package provides a number of methods similar to geom_jitter(). List them and briefly describe what each one does.

ggplot(data = mpg) +
  geom_quasirandom(
    mapping = aes(
      x = reorder(class, hwy, FUN = median),
      y = hwy
    ),
    method = "frowney" )

ggplot(data = mpg) +
  geom_beeswarm(mapping = aes(
    x = reorder(class, hwy, FUN = median),
    y = hwy
  ))

7.5.2.1 Two Categorical Variables

1. How could you rescale the count dataset above to more clearly show the distribution of cut within colour, or colour within cut?

diamonds %>%
  count(color, cut) %>%
  group_by(color) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
  geom_tile(mapping = aes(fill = prop))

diamonds %>%
  count(color, cut) %>%
  group_by(cut) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
  geom_tile(mapping = aes(fill = prop))

2. Use geom_tile() together with dplyr to explore how average flight delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?

flights %>%
  group_by(month, dest) %>%
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = factor(month), y = dest, fill = dep_delay)) +
  geom_tile() +
  labs(x = "Month", y = "Destination", fill = "Departure Delay")
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

flights %>%
  group_by(month, dest) %>%                                 # This gives us (month, dest) pairs
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  group_by(dest) %>%                                        # group all (month, dest) pairs by dest ..
  filter(n() == 12) %>%                                     # and only select those that have one entry per month 
  ungroup() %>%
  mutate(dest = reorder(dest, dep_delay)) %>%
  ggplot(aes(x = factor(month), y = dest, fill = dep_delay)) +
  geom_tile() +
  labs(x = "Month", y = "Destination", fill = "Departure Delay")
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

3. Why is it slightly better to use aes(x = color, y = cut) rather than aes(x = cut, y = color) in the example above?

diamonds %>%
  count(color, cut) %>%
  ggplot(mapping = aes(y = color, x = cut)) +
  geom_tile(mapping = aes(fill = n))

7.5.3.1 Two Continuous variables

1. Instead of summarising the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs cut_number()? How does that impact a visualisation of the 2d distribution of carat and price?

ggplot(
  data = diamonds,
  mapping = aes(color = cut_number(carat, 5), x = price)
) +
  geom_freqpoly() +
  labs(x = "Price", y = "Count", color = "Carat")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(
  data = diamonds,
  mapping = aes(color = cut_width(carat, 1, boundary = 0), x = price)
) +
  geom_freqpoly() +
  labs(x = "Price", y = "Count", color = "Carat")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2. Visualise the distribution of carat, partitioned by price.

ggplot(diamonds, aes(x = cut_number(price, 10), y = carat)) +
  geom_boxplot() +
  coord_flip() +
  xlab("Price")

3. How does the price distribution of very large diamonds compare to small diamonds? Is it as you expect, or does it surprise you?

4. Combine two of the techniques you’ve learned to visualise the combined distribution of cut, carat, and price.

ggplot(diamonds, aes(x = cut_number(carat, 5), y = price, colour = cut)) +
  geom_boxplot()

5. Two dimensional plots reveal outliers that are not visible in one dimensional plots. For example, some points in the plot below have an unusual combination of x and y values, which makes the points outliers even though their x and y values appear normal when examined separately.

ggplot(data = diamonds) +
  geom_point(mapping = aes(x = x, y = y)) +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))

  • Why is a scatterplot a better display than a binned plot for this case?

In this case, there is a strong relationship between
x and y. The outliers in this case are not extreme in either x or y. A binned plot would not reveal these outliers, and may lead us to conclude that the largest value of x was an outlier even though it appears to fit the bivariate pattern well.