# Bar chart: number of diamonds in each cut category.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar()

# Histogram of carat using 0.5-wide bins.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

# Keep only diamonds below 3 carats; `smaller` is reused by later plots.
smaller <- filter(diamonds, carat < 3)

# Same histogram on the subset with narrower (0.1) bins.
ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

# One frequency polygon per cut, so the groups can be overlaid.
ggplot(smaller, aes(x = carat, colour = cut)) +
  geom_freqpoly(binwidth = 0.1)

# Very narrow (0.01) bins on the subset.
ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.01)

# Histogram of the `eruptions` column of `faithful`.
ggplot(faithful, aes(x = eruptions)) +
  geom_histogram(binwidth = 0.25)

# Histogram of the `y` column over its full range.
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5)

# Same histogram, with the count axis limited to 0-50 via coord_cartesian().
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

# Numeric summary of the three dimension columns.
diamonds %>%
  select(x, y, z) %>%
  summary()
## x y z
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.710 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.700 Median : 5.710 Median : 3.530
## Mean : 5.731 Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :10.740 Max. :58.900 Max. :31.800
#> x y z
#> Min. : 0.00 Min. : 0.0 Min. : 0.0
#> 1st Qu.: 4.71 1st Qu.: 4.7 1st Qu.: 2.9
#> Median : 5.70 Median : 5.7 Median : 3.5
#> Mean : 5.73 Mean : 5.7 Mean : 3.5
#> 3rd Qu.: 6.54 3rd Qu.: 6.5 3rd Qu.: 4.0
#> Max. :10.74 Max. :58.9 Max. :31.8
# Histograms of each dimension column (x, y, z) with 0.01-wide bins.
ggplot(diamonds, aes(x = x)) +
  geom_histogram(binwidth = 0.01)

ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.01)

ggplot(diamonds, aes(x = z)) +
  geom_histogram(binwidth = 0.01)
To answer the question: the depth z is always smaller than the length x or the width y, and length is greater than width in fewer than half of the observations.
# Histogram of price for diamonds priced below 2500, using bins of width 10
# centred on 0.
diamonds %>%
  filter(price < 2500) %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 10, center = 0)

# Number of diamonds at each carat value from 0.99 to 1 (inclusive).
diamonds %>%
  filter(carat >= 0.99 & carat <= 1) %>%
  count(carat)
## # A tibble: 2 × 2
## carat n
## <dbl> <int>
## 1 0.99 23
## 2 1 1558
#> # A tibble: 2 x 2
#> carat n
#> <dbl> <int>
#> 1 0.99 23
#> 2 1 1558
# Count diamonds at each carat value from 0.9 to 1.1 (inclusive) and print
# every row of the result rather than the default truncated view.
diamonds %>%
  filter(carat >= 0.9 & carat <= 1.1) %>%
  count(carat) %>%
  print(n = Inf)
## # A tibble: 21 × 2
## carat n
## <dbl> <int>
## 1 0.9 1485
## 2 0.91 570
## 3 0.92 226
## 4 0.93 142
## 5 0.94 59
## 6 0.95 65
## 7 0.96 103
## 8 0.97 59
## 9 0.98 31
## 10 0.99 23
## 11 1 1558
## 12 1.01 2242
## 13 1.02 883
## 14 1.03 523
## 15 1.04 475
## 16 1.05 361
## 17 1.06 373
## 18 1.07 342
## 19 1.08 246
## 20 1.09 287
## 21 1.1 278
#> # A tibble: 21 x 2
#> carat n
#> <dbl> <int>
#> 1 0.9 1485
#> 2 0.91 570
#> 3 0.92 226
#> 4 0.93 142
#> 5 0.94 59
#> 6 0.95 65
#> 7 0.96 103
#> 8 0.97 59
#> 9 0.98 31
#> 10 0.99 23
#> 11 1 1558
#> 12 1.01 2242
#> 13 1.02 883
#> 14 1.03 523
#> 15 1.04 475
#> 16 1.05 361
#> 17 1.06 373
#> 18 1.07 342
#> 19 1.08 246
#> 20 1.09 287
#> 21 1.1 278
The coord_cartesian() function zooms in on the area specified by the limits, after having calculated and drawn the geoms. Since the histogram bins have already been calculated, they are unaffected by the zoom.
# Price histogram (default binning) viewed through coord_cartesian() limits.
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  coord_cartesian(xlim = c(100, 5000), ylim = c(0, 3000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
However, the xlim() and ylim() functions influence actions before the calculation of the stats related to the histogram. Thus, any values outside the x- and y-limits are dropped before calculating bin widths and counts. This can influence how the histogram looks.
# Price histogram (default binning) with the limits set through xlim()/ylim()
# rather than coord_cartesian().
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  xlim(100, 5000) +
  ylim(0, 3000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14714 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_bar()`).
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> Warning: Removed 14714 rows containing non-finite values (stat_bin).
#> Warning: Removed 6 rows containing missing values (geom_bar).
# Option 1: drop every row whose y lies outside [3, 20].
diamonds2 <- filter(diamonds, between(y, 3, 20))

# Option 2: keep all rows and set out-of-range y values to NA instead.
# This assignment overwrites option 1, so the plots below use this version.
diamonds2 <- mutate(diamonds, y = ifelse(y < 3 | y > 20, NA, y))

# Scatterplot of y against x; rows with a missing y trigger a warning.
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point()
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).
#> Warning: Removed 9 rows containing missing values (`geom_point()`).
# Same scatterplot, with na.rm = TRUE passed to the point layer.
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)

# Scheduled departure time for cancelled vs. non-cancelled flights.
# A flight counts as cancelled when dep_time is missing. sched_dep_time is
# split with %/% 100 and %% 100 (presumably an HHMM integer -- confirm) and
# recombined as fractional hours.
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(aes(x = sched_dep_time, colour = cancelled)) +
  geom_freqpoly(binwidth = 1 / 4)
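Exercises 1. What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference? In a histogram, missing values are removed when the number of observations in each bin is calculated. In a bar chart of a categorical variable, NA is treated as just another category and gets its own bar. The difference arises because a histogram needs a numeric x value to place each observation in a bin, whereas a bar chart only needs a category label.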
## Covariation
# Price distribution per cut as overlaid frequency polygons (raw counts).
ggplot(diamonds, aes(x = price, colour = cut)) +
  geom_freqpoly(binwidth = 500)

# Number of diamonds in each cut category.
ggplot(diamonds, aes(x = cut)) +
  geom_bar()
# Price distribution per cut, plotting the computed `density` statistic on the
# y axis instead of the raw count.
# `after_stat(density)` replaces the `..density..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0.
ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)
# Price by cut.
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot()

# Highway mileage by vehicle class, classes in their default order.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

# Same plot with classes ordered by their median hwy.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot()

# Ordered version again, flipped so the class names sit on the vertical axis.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot() +
  coord_flip()
Exercises 1. Use what you’ve learned to improve the visualisation of the departure times of cancelled vs. non-cancelled flights.
# Boxplots of scheduled departure time (in fractional hours) for cancelled
# vs. non-cancelled flights; a flight is cancelled when dep_time is missing.
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(aes(x = cancelled, y = sched_dep_time)) +
  geom_boxplot()
2. What variable in the diamonds dataset is most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive? The negative relationship between carat and cut quality can be due to the way in which diamonds are selected for sale. A larger diamond can be profitably sold with a lower quality cut, while a smaller diamond requires a better cut.
# Highway mileage by class, classes ordered by median hwy, drawn horizontally.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot() +
  coord_flip()
4. One problem with box plots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of “outlying values”. One approach to remedy this problem is the letter value plot. Install the lvplot package, and try using geom_lv() to display the distribution of price vs cut. What do you learn?
How do you interpret the plots?
Like box-plots, the boxes of the letter-value plot correspond to quantiles. However, they incorporate far more quantiles than box-plots. They are useful for larger datasets.
5. Compare and contrast geom_violin() with a faceted geom_histogram(), or a colored geom_freqpoly(). What are the pros and cons of each method?
# Coloured frequency polygons of price, one per cut, plotting the computed
# `density` statistic on the y axis.
# `after_stat(density)` replaces the `..density..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0.
ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
# Price histogram (default binning) in one panel per cut, stacked in a single
# column, each panel with its own y scale.
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut, ncol = 1, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Violin plot of price for each cut, drawn horizontally.
diamonds %>%
  ggplot(aes(x = cut, y = price)) +
  geom_violin() +
  coord_flip()
The geom_freqpoly() is better for look-up: meaning that given a price,
it is easy to tell which cut has the highest density. However, the
overlapping lines make it difficult to distinguish how the overall
distributions relate to each other. The geom_violin() and faceted
geom_histogram() have similar strengths and weaknesses. It is easy to
visually distinguish differences in the overall shape of the
distributions (skewness, central values, variance, etc). However, since
we can’t easily compare the vertical values of the distribution, it is
difficult to look up which category has the highest density for a given
price.
6. If you have a small dataset, it’s sometimes useful to use
geom_jitter() to see the relationship between a continuous and
categorical variable. The ggbeeswarm package provides a number of
methods similar to geom_jitter(). List them and briefly describe what
each one does.
geom_quasirandom() produces plots that are a mix of jitter and violin plots. There are several different methods that determine exactly how the random location of the points is generated. geom_beeswarm() produces a plot similar to a violin plot, but by offsetting the points.
# Count plot: one point per cut/color combination, sized by the number of
# observations.
ggplot(diamonds, aes(x = cut, y = color)) +
  geom_count()

# Same counts computed explicitly and drawn as a heatmap filled by `n`.
count(diamonds, color, cut) %>%
  ggplot(aes(x = color, y = cut, fill = n)) +
  geom_tile()
Exercises
# Heatmap of the share of each cut within a color: prop divides each count by
# the total for its color.
count(diamonds, color, cut) %>%
  group_by(color) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = color, y = cut, fill = prop)) +
  geom_tile()

# Heatmap of the share of each color within a cut: prop divides each count by
# the total for its cut.
count(diamonds, color, cut) %>%
  group_by(cut) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = color, y = cut, fill = prop)) +
  geom_tile()
These are the logical boundaries of proportions. This makes it possible to compare each cell to its actual value, and would improve comparisons across multiple plots.
The plot becomes difficult to read because of missing values.
# Count heatmap with the axes swapped: cut on x, color on y.
count(diamonds, color, cut) %>%
  ggplot(aes(x = cut, y = color, fill = n)) +
  geom_tile()

# Scatterplot of price against carat.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point()

# Same scatterplot with nearly transparent points.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 100)

# Rectangular 2d bins on the `smaller` subset.
ggplot(smaller, aes(x = carat, y = price)) +
  geom_bin2d()

# Hexagonal 2d bins on the same subset; geom_hex() needs the hexbin package.
# install.packages("hexbin")
ggplot(smaller, aes(x = carat, y = price)) +
  geom_hex()
## Warning: Computation failed in `stat_binhex()`.
## Caused by error in `compute_group()`:
## ! The package "hexbin" is required for `stat_bin_hex()`.
# Boxplots of price with carat grouped by cut_width(carat, 0.1).
ggplot(smaller, aes(x = carat, y = price, group = cut_width(carat, 0.1))) +
  geom_boxplot()

# Boxplots of price with carat grouped by cut_number(carat, 20).
ggplot(smaller, aes(x = carat, y = price, group = cut_number(carat, 20))) +
  geom_boxplot()
Exercises 1. Instead of summarising the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs cut_number()? How does that impact a visualisation of the 2d distribution of carat and price?
Both cut_width() and cut_number() split a variable into groups. When using cut_width(), we need to choose the width, and the number of bins will be calculated automatically. When using cut_number(), we need to specify the number of bins, and the widths will be calculated automatically. Since there are very few diamonds larger than 2-carats, this is not as informative. However, using a width of 0.5 carats creates too many groups, and splitting at non-whole numbers is unappealing.
# Price frequency polygons, one per carat group from cut_number(carat, 5).
diamonds %>%
  ggplot(aes(x = price, color = cut_number(carat, 5))) +
  geom_freqpoly() +
  labs(x = "Price", y = "Count", color = "Carat")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Price frequency polygons, one per carat group from
# cut_width(carat, 1, boundary = 0).
diamonds %>%
  ggplot(aes(x = price, color = cut_width(carat, 1, boundary = 0))) +
  geom_freqpoly() +
  labs(x = "Price", y = "Count", color = "Carat")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
2. Plotted with a box plot with 10 bins with an equal number of observations, and the width determined by the number of observations. Plotted with a box plot with 10 equal-width bins of $2,000. The argument boundary = 0 ensures that the first bin is $0–$2,000.
# Carat distribution within price groups from cut_number(price, 10), drawn
# horizontally.
diamonds %>%
  ggplot(aes(x = cut_number(price, 10), y = carat)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "Price")

# Carat distribution within price groups from
# cut_width(price, 2000, boundary = 0), with varwidth = TRUE on the boxes.
diamonds %>%
  ggplot(aes(x = cut_width(price, 2000, boundary = 0), y = carat)) +
  geom_boxplot(varwidth = TRUE) +
  coord_flip() +
  labs(x = "Price")
How does the price distribution of very large diamonds compare to small diamonds? Is it as you expect, or does it surprise you? The distribution of very large diamonds is more variable. I am not surprised, since I knew little about diamond prices. After the fact, it does not seem surprising (as many things do). I would guess that this is due to the way in which diamonds are selected for retail sales. Suppose that someone selling a diamond only finds it profitable to sell it if some combination of size, cut, clarity, and color is above a certain threshold. The smallest diamonds are only profitable to sell if they are exceptional in all the other factors (cut, clarity, and color), so the small diamonds sold have similar characteristics. However, larger diamonds may be profitable regardless of the values of the other factors. Thus we will observe large diamonds with a wider variety of cut, clarity, and color and thus more variability in prices.
Combine two of the techniques you’ve learned to visualize the combined distribution of cut, carat, and price.
# Hexagonal bins of price against carat, one panel per cut stacked in a
# single column. geom_hex() needs the hexbin package.
diamonds %>%
  ggplot(aes(x = carat, y = price)) +
  geom_hex() +
  facet_wrap(~cut, ncol = 1)
## Warning: Computation failed in `stat_binhex()`.
## Computation failed in `stat_binhex()`.
## Computation failed in `stat_binhex()`.
## Computation failed in `stat_binhex()`.
## Computation failed in `stat_binhex()`.
## Caused by error in `compute_group()`:
## ! The package "hexbin" is required for `stat_bin_hex()`.
# Price boxplots for each carat group from cut_number(carat, 5), coloured by
# cut.
diamonds %>%
  ggplot(aes(x = cut_number(carat, 5), y = price, colour = cut)) +
  geom_boxplot()

# Scatterplot of y against x, with both axes limited to 4-11 via
# coord_cartesian().
ggplot(diamonds, aes(x = x, y = y)) +
  geom_point() +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
In this case, there is a strong relationship between x and y. The outliers
in this case are not extreme in either x or y. A binned plot would not
reveal these outliers, and may lead us to conclude that the largest value
of x was an outlier even though it appears to fit the bivariate pattern
well.
## Patterns and Models
# Scatterplot of waiting against eruptions from `faithful`.
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point()

library(modelr)

# Linear model of log(price) on log(carat).
mod <- lm(log(price) ~ log(carat), data = diamonds)

# Attach the model residuals with add_residuals() and exponentiate them.
diamonds2 <- mutate(add_residuals(diamonds, mod), resid = exp(resid))

# Residuals against carat.
ggplot(diamonds2, aes(x = carat, y = resid)) +
  geom_point()

# Residuals by cut.
ggplot(diamonds2, aes(x = cut, y = resid)) +
  geom_boxplot()
## ggplot2 calls
# Fully explicit call: every argument is named.
faithful %>%
  ggplot(data = ., mapping = aes(x = eruptions)) +
  geom_freqpoly(binwidth = 0.25)

# Concise call: data and the x aesthetic are matched by position.
faithful %>%
  ggplot(aes(eruptions)) +
  geom_freqpoly(binwidth = 0.25)

# Data transformation piped straight into a plot: count each cut/clarity
# combination and draw the counts as a heatmap.
count(diamonds, cut, clarity) %>%
  ggplot(aes(clarity, cut, fill = n)) +
  geom_tile()