# Summary statistics (min, quartiles, mean, max) of the x, y and z columns.
diamonds %>%
  select(x, y, z) %>%
  summary()
## x y z
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.710 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.700 Median : 5.710 Median : 3.530
## Mean : 5.731 Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :10.740 Max. :58.900 Max. :31.800
# Distribution of x, drawn with narrow 0.01-wide bins.
ggplot(diamonds, aes(x = x)) +
  geom_histogram(binwidth = 0.01)
# Distribution of y, same bin width.
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.01)
# Distribution of z, same bin width.
ggplot(diamonds, aes(x = z)) +
  geom_histogram(binwidth = 0.01)
# Keep only values strictly between 0 and 10 so that the outliers no longer
# stretch the axis and each distribution is easier to see.
# Axis breaks are placed at every integer from 1 to 10.
diamonds %>%
  filter(x > 0 & x < 10) %>%
  ggplot(aes(x = x)) +
  geom_histogram(binwidth = 0.01) +
  scale_x_continuous(breaks = seq_len(10))
diamonds %>%
  filter(y > 0 & y < 10) %>%
  ggplot(aes(x = y)) +
  geom_histogram(binwidth = 0.01) +
  scale_x_continuous(breaks = seq_len(10))
diamonds %>%
  filter(z > 0 & z < 10) %>%
  ggplot(aes(x = z)) +
  geom_histogram(binwidth = 0.01) +
  scale_x_continuous(breaks = seq_len(10))
# There are no diamonds with a price of $1,500 (between $1,455 and $1,545, inclusive). There is a bulge in the distribution around $750.
# Histogram of prices below 2500, using bins of width 10 centred on
# multiples of 10.
diamonds %>%
  filter(price < 2500) %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 10, center = 0)
# The last digits of prices are often not uniformly distributed. They are
# often round, ending in 0 or 5 (for one-half). Another common pattern is
# ending in 99, as in $1999.
# Last digit of the price.
ggplot(mutate(diamonds, ending = price %% 10), aes(x = ending)) +
  geom_histogram(binwidth = 1, center = 0)
# Last two digits of the price.
ggplot(mutate(diamonds, ending = price %% 100), aes(x = ending)) +
  geom_histogram(binwidth = 1)
# Last three digits of the price, restricted to endings from 500 to 800
# (both inclusive).
diamonds %>%
  mutate(ending = price %% 1000) %>%
  filter(between(ending, 500, 800)) %>%
  ggplot(aes(x = ending)) +
  geom_histogram(binwidth = 1)
# There are roughly 68 times as many 1 carat diamonds as 0.99 carat diamonds (1,558 vs. 23).
# Count diamonds per carat value in the closed range [0.99, 1].
diamonds %>%
  filter(between(carat, 0.99, 1)) %>%
  count(carat)
## # A tibble: 2 × 2
## carat n
## <dbl> <int>
## 1 0.99 23
## 2 1 1558
# coord_cartesian() zooms in on the area given by the limits after the geoms
# have been calculated and drawn. The histogram bins are already computed at
# that point, so they are unaffected by the zoom.
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  coord_cartesian(xlim = c(100, 5000), ylim = c(0, 3000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# xlim() and ylim(), by contrast, take effect before the histogram stat is
# calculated: values outside the x- and y-limits are dropped before bin widths
# and counts are computed, which can change how the histogram looks (see the
# "Removed ... rows" warnings below).
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  xlim(100, 5000) +
  ylim(0, 3000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14714 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
# Replace y values below 3 or above 20 with NA, then plot the histogram of y.
# The NA values are dropped by stat_bin with a warning.
diamonds2 <- mutate(diamonds, y = if_else(y < 3 | y > 20, NA_real_, y))
ggplot(data = diamonds2, mapping = aes(x = y)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
# Set cut to NA for a random subset of rows (each row with probability 0.1),
# converting it to character, then draw a bar chart of the result.
diamonds %>%
  mutate(cut = if_else(runif(n()) < 0.1, NA_character_, as.character(cut))) %>%
  ggplot(aes(x = cut)) +
  geom_bar()
# With na.rm = TRUE the NA is removed before aggregating, so the mean is taken
# over the three remaining values.
mean(c(0, 1, 2, NA), na.rm = TRUE)
## [1] 1
# Likewise, sum() skips the NA instead of returning NA.
sum(c(0, 1, 2, NA), na.rm = TRUE)
## [1] 3
# Box plot of hwy per class, with classes ordered by median hwy; coord_flip()
# swaps the axes so the boxes are drawn horizontally.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot() +
  coord_flip()
library("ggstance")
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
# Same plot using geom_boxploth(): the x and y aesthetics are swapped in the
# mapping itself, so no coord_flip() is used.
ggplot(mpg, aes(x = hwy, y = reorder(class, hwy, FUN = median))) +
  geom_boxploth()
# Frequency polygons of price, one line per cut, with density on the y-axis
# instead of raw counts so that cuts with different numbers of diamonds can be
# compared. after_stat(density) refers to the density computed by the binning
# stat; it replaces the deprecated `..density..` notation.
ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
# Histogram of price in one panel per cut, stacked in a single column; each
# panel gets its own y-axis range.
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  facet_wrap(vars(cut), ncol = 1, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Violin plot of price per cut, flipped so the violins are drawn horizontally.
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_violin() +
  coord_flip()
library("ggbeeswarm")
# hwy per class (classes ordered by median hwy) drawn with geom_quasirandom().
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_quasirandom()
# The same mapping drawn with geom_beeswarm().
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_beeswarm()
# To show the distribution of cut within color, compute prop: the share of
# each cut among all diamonds of the same color (a grouped mutate), and use it
# as the tile fill.
diamonds %>%
  count(color, cut) %>%
  group_by(color) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = color, y = cut, fill = prop)) +
  geom_tile()
# To scale by the distribution of color within cut instead, group by cut
# before computing the proportions.
diamonds %>%
  count(color, cut) %>%
  group_by(cut) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = color, y = cut, fill = prop)) +
  geom_tile()
library("nycflights13")
# Heat map of the mean departure delay for every (month, destination) pair.
flights %>%
  group_by(month, dest) %>%
  # .groups = "drop" returns an ungrouped result explicitly, which also
  # silences the "summarise() has grouped output" message.
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = factor(month), y = dest, fill = dep_delay)) +
  geom_tile() +
  labs(x = "Month", y = "Destination", fill = "Departure Delay")
# To improve it, we could sort destinations by a meaningful quantity and
# remove missing values.
flights %>%
  group_by(month, dest) %>% # This gives us (month, dest) pairs
  # .groups = "drop" makes the ungrouping explicit and silences the
  # "summarise() has grouped output" message; the data is regrouped below.
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
  group_by(dest) %>% # group all (month, dest) pairs by dest ..
  filter(n() == 12) %>% # and only select those that have one entry per month
  ungroup() %>%
  # Order destinations by their dep_delay values (reorder() uses the mean by
  # default) so that the rows of the heat map are sorted.
  mutate(dest = reorder(dest, dep_delay)) %>%
  ggplot(aes(x = factor(month), y = dest, fill = dep_delay)) +
  geom_tile() +
  labs(x = "Month", y = "Destination", fill = "Departure Delay")
# Plotted with a box plot over 10 price bins that each contain (approximately) the same number of observations, so the price range covered by each bin is determined by the number of observations.
# carat per price bin; cut_number() splits price into 10 bins.
ggplot(data = diamonds, mapping = aes(x = cut_number(price, 10), y = carat)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "Price")
# Box plot of carat over equal-width price bins of $2,000. boundary = 0 makes
# the first bin run from $0 to $2,000, and varwidth = TRUE varies the box
# widths with the number of observations in each bin.
ggplot(
  data = diamonds,
  mapping = aes(x = cut_width(price, 2000, boundary = 0), y = carat)
) +
  geom_boxplot(varwidth = TRUE) +
  coord_flip() +
  labs(x = "Price")
# Price by carat bin (5 bins from cut_number()), with one coloured box per cut.
ggplot(
  data = diamonds,
  mapping = aes(x = cut_number(carat, 5), y = price, colour = cut)
) +
  geom_boxplot()
# The reverse arrangement: price by cut, with one coloured box per carat bin.
ggplot(
  data = diamonds,
  mapping = aes(x = cut, y = price, colour = cut_number(carat, 5))
) +
  geom_boxplot()
# Scatter plot of y against x, zoomed to the 4-11 range on both axes.
# coord_cartesian() only zooms; no points are removed from the data.
ggplot(diamonds, aes(x = x, y = y)) +
  geom_point() +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))