if(!require('lvplot')) {
install.packages('lvplot')
library('lvplot')
}
## Loading required package: lvplot
if(!require('ggbeeswarm')) {
install.packages('ggbeeswarm')
library('ggbeeswarm')
}
## Loading required package: ggbeeswarm
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(nycflights13)
summary(select(diamonds, x, y,z))
## x y z
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.710 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.700 Median : 5.710 Median : 3.530
## Mean : 5.731 Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :10.740 Max. :58.900 Max. :31.800
From this dataset you can see X & Y > Z and there are outliers. Some diamonds are values at 0.
filter(diamonds, x == 0 | y == 0 | z == 0)
## # A tibble: 20 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 1 Premium G SI2 59.1 59 3142 6.55 6.48 0
## 2 1.01 Premium H I1 58.1 59 3167 6.66 6.6 0
## 3 1.1 Premium G SI2 63 59 3696 6.5 6.47 0
## 4 1.01 Premium F SI2 59.2 58 3837 6.5 6.47 0
## 5 1.5 Good G I1 64 61 4731 7.15 7.04 0
## 6 1.07 Ideal F SI2 61.6 56 4954 0 6.62 0
## 7 1 Very Good H VS2 63.3 53 5139 0 0 0
## 8 1.15 Ideal G VS2 59.2 56 5564 6.88 6.83 0
## 9 1.14 Fair G VS1 57.5 67 6381 0 0 0
## 10 2.18 Premium H SI2 59.4 61 12631 8.49 8.45 0
## 11 1.56 Ideal G VS2 62.2 54 12800 0 0 0
## 12 2.25 Premium I SI1 61.3 58 15397 8.52 8.42 0
## 13 1.2 Premium D VVS1 62.1 59 15686 0 0 0
## 14 2.2 Premium H SI1 61.2 59 17265 8.42 8.37 0
## 15 2.25 Premium H SI2 62.8 59 18034 0 0 0
## 16 2.02 Premium H VS2 62.7 53 18207 8.02 7.95 0
## 17 2.8 Good G SI2 63.8 58 18788 8.9 8.85 0
## 18 0.71 Good F SI2 64.1 60 2130 0 0 0
## 19 0.71 Good F SI2 64.1 60 2130 0 0 0
## 20 1.12 Premium G I1 60.4 59 2383 6.71 6.67 0
diamonds %>%
arrange(desc(y)) %>%
head()
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2 Premium H SI2 58.9 57 12210 8.09 58.9 8.06
## 2 0.51 Ideal E VS1 61.8 55 2075 5.15 31.8 5.12
## 3 5.01 Fair J I1 65.5 59 18018 10.7 10.5 6.98
## 4 4.5 Fair J I1 65.8 58 18531 10.2 10.2 6.72
## 5 4.01 Premium I I1 61 61 15223 10.1 10.1 6.17
## 6 4.01 Premium J I1 62.5 62 15223 10.0 9.94 6.24
diamonds %>%
arrange(desc(z)) %>%
head()
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.51 Very Good E VS1 61.8 54.7 1970 5.12 5.15 31.8
## 2 2 Premium H SI2 58.9 57 12210 8.09 58.9 8.06
## 3 5.01 Fair J I1 65.5 59 18018 10.7 10.5 6.98
## 4 4.5 Fair J I1 65.8 58 18531 10.2 10.2 6.72
## 5 4.13 Fair H I1 64.8 61 17329 10 9.85 6.43
## 6 3.65 Fair H I1 67.1 53 11668 9.53 9.48 6.38
Closer look at the right skew of (x,y,z)
filter(diamonds, x > 0, x < 10) %>%
ggplot() +
geom_histogram(mapping = aes(x = x), binwidth = 0.01) +
scale_x_continuous(breaks = 1:10)
filter(diamonds, y > 0, y< 10) %>%
ggplot() +
geom_histogram(mapping = aes(x = y), binwidth= 0.01) +
scale_x_continuous(breaks = 1:10)
Summarize the data, comparing the values of (x,y,z) which describe the length, width, and depth.
summarise(diamonds, mean(x>y), mean(x>z), mean(y>z))
## # A tibble: 1 × 3
## `mean(x > y)` `mean(x > z)` `mean(y > z)`
## <dbl> <dbl> <dbl>
## 1 0.434 1.00 1.00
Depth is always smaller the length or width. The shallower the diamond is the more affect it has the reflection of light.
ggplot(filter(diamonds, price < 2500), aes(x = price))+
geom_histogram(binwidth= 10, center = 0)
ggplot(filter(diamonds), aes(x = price)) +
geom_histogram(binwidth = 100, center =0)
diamonds %>%
filter(carat >= 0.99, carat <= 1) %>%
count(carat)
## # A tibble: 2 × 2
## carat n
## <dbl> <int>
## 1 0.99 23
## 2 1 1558
diamonds %>%
filter(carat >= 0.9, carat <= 1.1) %>%
count(carat) %>%
print(n= Inf)
## # A tibble: 21 × 2
## carat n
## <dbl> <int>
## 1 0.9 1485
## 2 0.91 570
## 3 0.92 226
## 4 0.93 142
## 5 0.94 59
## 6 0.95 65
## 7 0.96 103
## 8 0.97 59
## 9 0.98 31
## 10 0.99 23
## 11 1 1558
## 12 1.01 2242
## 13 1.02 883
## 14 1.03 523
## 15 1.04 475
## 16 1.05 361
## 17 1.06 373
## 18 1.07 342
## 19 1.08 246
## 20 1.09 287
## 21 1.1 278
ggplot(diamonds) +
geom_histogram(mapping = aes(x = price)) +
coord_cartesian(xlim = c(100, 5000), ylim = c(0, 3000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Vs.
ggplot(diamonds) +
geom_histogram(mapping = aes(x = price)) +
xlim(100, 5000) +
ylim(0, 3000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14714 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
Missing values are removed when the number of observations in each bin are calculated. See the warning message: Removed 9 rows containing non-finite values (stat_bin)
diamonds2 <- diamonds %>%
mutate(y = ifelse(y < 3 | y > 20, NA, y))
ggplot(diamonds2, aes(x = y)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
diamonds %>%
mutate(cut = if_else(runif(n()) < 0.1, NA_character_, as.character(cut))) %>%
ggplot() +
geom_bar(mapping = aes(x = cut))
In a histogram, the x aesthetic variable needs to be numeric, and stat_bin() groups the observations by ranges into bins. Since the numeric value of the NA observations is unknown, they cannot be placed in a particular bin, and are dropped.In the geom_bar() function, NA is treated as another category. The x aesthetic in geom_bar() requires a discrete (categorical) variable, and missing values act like another category.
This option removes NA values from the vector prior to calculating the mean and sum.
mean(c(0, 1, 2, NA), na.rm = TRUE)
## [1] 1
sum(c(0, 1, 2, NA), na.rm = TRUE)
## [1] 3
In a histogram, the x aesthetic variable needs to be numeric, and stat_bin() groups the observations by ranges into bins. Since the numeric value of the NA observations is unknown, they cannot be placed in a particular bin, and are dropped.
nycflights13::flights %>%
mutate(
cancelled = is.na(dep_time),
sched_hour = sched_dep_time %/% 100,
sched_min = sched_dep_time %% 100,
sched_dep_time = sched_hour + sched_min / 60
) %>%
ggplot() +
geom_boxplot(mapping = aes(y = sched_dep_time, x = cancelled))
ggplot(data = diamonds) +
geom_boxplot(mapping = aes(x = reorder(cut, carat, FUN = median), y = carat)) +
coord_flip()
Like box-plots, the boxes of the letter-value plot correspond to quantiles. However, they incorporate far more quantiles than box-plots. They are useful for larger datasets because,
ggplot(diamonds, aes(x = cut, y = price)) +
geom_lv()
geom_lv shows the distrubtion of prices by cut a similar shape of tower of babble by the width.
ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
ggplot(data = diamonds, mapping = aes(x = price)) +
geom_histogram() +
facet_wrap(~cut, ncol = 1, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_violin() +
coord_flip()
ggplot(data = mpg) +
geom_quasirandom(
mapping = aes(
x = reorder(class, hwy, FUN = median),
y = hwy
),
method = "frowney" )
ggplot(data = mpg) +
geom_beeswarm(mapping = aes(
x = reorder(class, hwy, FUN = median),
y = hwy
))
diamonds %>%
count(color, cut) %>%
group_by(color) %>%
mutate(prop = n / sum(n)) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes(fill = prop))
diamonds %>%
count(color, cut) %>%
group_by(cut) %>%
mutate(prop = n / sum(n)) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes(fill = prop))
flights %>%
group_by(month, dest) %>%
summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
ggplot(aes(x = factor(month), y = dest, fill = dep_delay)) +
geom_tile() +
labs(x = "Month", y = "Destination", fill = "Departure Delay")
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
flights %>%
group_by(month, dest) %>% # This gives us (month, dest) pairs
summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
group_by(dest) %>% # group all (month, dest) pairs by dest ..
filter(n() == 12) %>% # and only select those that have one entry per month
ungroup() %>%
mutate(dest = reorder(dest, dep_delay)) %>%
ggplot(aes(x = factor(month), y = dest, fill = dep_delay)) +
geom_tile() +
labs(x = "Month", y = "Destination", fill = "Departure Delay")
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
diamonds %>%
count(color, cut) %>%
ggplot(mapping = aes(y = color, x = cut)) +
geom_tile(mapping = aes(fill = n))
ggplot(
data = diamonds,
mapping = aes(color = cut_number(carat, 5), x = price)
) +
geom_freqpoly() +
labs(x = "Price", y = "Count", color = "Carat")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(
data = diamonds,
mapping = aes(color = cut_width(carat, 1, boundary = 0), x = price)
) +
geom_freqpoly() +
labs(x = "Price", y = "Count", color = "Carat")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(diamonds, aes(x = cut_number(price, 10), y = carat)) +
geom_boxplot() +
coord_flip() +
xlab("Price")
ggplot(diamonds, aes(x = cut_number(carat, 5), y = price, colour = cut)) +
geom_boxplot()
ggplot(data = diamonds) +
geom_point(mapping = aes(x = x, y = y)) +
coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
In this case, there is a strong relationship between
x and y. The outliers in this case are not extreme in either x or y. A
binned plot would not reveal these outliers, and may lead us to conclude
that the largest value of x was an outlier even though it appears to fit
the bivariate pattern well.