Useful resources:
install.packages("tidyverse", repos = "http://cran.us.r-project.org", dependencies = TRUE)
library(tidyverse)
# Layers are drawn in the order they are added, so the last geom ends up on top.

# Mapping color inside aes() makes it vary with a variable (drv).
# The second layer uses shape 21; for shapes 21-24, color applies to the stroke.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = drv), size = 3) +
  geom_point(shape = 21, color = "white", size = 3, stroke = 1.5)

# With the default shape, color fills the entire circle: larger white discs go
# down first, and the points colored by drv are drawn over them.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(color = "white", size = 4) +
  geom_point(aes(color = drv))
Far better an approximate answer to the right question, which is often vague, than an exact answer to the wrong question, which can always be made precise. - John Tukey
What we do:
Why we do:
Terms:
R). A tendency of the values of a variable to change from one observation to another (Note: the book also discusses measurement error).
# Categorical variable: bar chart of cut, followed by the same counts as a table
ggplot(diamonds, aes(x = cut)) +
  geom_bar()
count(diamonds, cut)
## # A tibble: 5 x 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
# Continuous variable: histogram of carat with 0.5-wide bins, followed by the
# number of observations per bin computed with cut_width()
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.5)
count(diamonds, cut_width(carat, 0.5))
## Warning: package 'bindrcpp' was built under R version 3.5.2
## # A tibble: 11 x 2
## `cut_width(carat, 0.5)` n
## <fct> <int>
## 1 [-0.25,0.25] 785
## 2 (0.25,0.75] 29498
## 3 (0.75,1.25] 15977
## 4 (1.25,1.75] 5313
## 5 (1.75,2.25] 2002
## 6 (2.25,2.75] 322
## 7 (2.75,3.25] 32
## 8 (3.25,3.75] 5
## 9 (3.75,4.25] 4
## 10 (4.25,4.75] 1
## 11 (4.75,5.25] 1
# Restrict the data to diamonds below 3 carats and use a smaller bin size
smaller <- filter(diamonds, carat < 3)
ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.1)
# Overlay one frequency polygon per cut instead of stacking histograms
ggplot(smaller, aes(x = carat, color = cut)) +
  geom_freqpoly(binwidth = 0.1)
Sample questions (still for a single variable):
# Very narrow bins (0.01 carat) on the subset of diamonds below 3 carats
ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.01)
# Not executed here: try it yourself!
ggplot(faithful, aes(x = eruptions)) +
  geom_histogram(binwidth = 0.25)
Outliers? Data entry errors?
# Histogram of y with a normal bin width; the x axis spans a wide range
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5)
# Limit the vertical (count) axis to 0-50 with coord_cartesian(), which
# narrows the view without removing observations
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))
# Investigate the unusual cases further: y below 3 or above 20, sorted by y
unusual <- arrange(filter(diamonds, y < 3 | y > 20), y)
unusual
## # A tibble: 9 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 1 Very Good H VS2 63.3 53 5139 0 0 0
## 2 1.14 Fair G VS1 57.5 67 6381 0 0 0
## 3 1.56 Ideal G VS2 62.2 54 12800 0 0 0
## 4 1.2 Premium D VVS1 62.1 59 15686 0 0 0
## 5 2.25 Premium H SI2 62.8 59 18034 0 0 0
## 6 0.71 Good F SI2 64.1 60 2130 0 0 0
## 7 0.71 Good F SI2 64.1 60 2130 0 0 0
## 8 0.51 Ideal E VS1 61.8 55 2075 5.15 31.8 5.12
## 9 2 Premium H SI2 58.9 57 12210 8.09 58.9 8.06
Two ways of handling outliers:
# Keep only the rows whose y lies between 3 and 20, then count what is left
diamonds2 <- filter(diamonds, between(y, 3, 20))
nrow(diamonds2)
## [1] 53931
# Keep every row, but set y to NA wherever it is below 3 or above 20
diamonds2 <- mutate(diamonds, y = ifelse(y >= 3 & y <= 20, y, NA))
nrow(diamonds2)
## [1] 53940
# Scatterplot of x against y using the data with NA-replaced values
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point()
## Warning: Removed 9 rows containing missing values (geom_point).
Describes the behavior between variables: The tendency for the values of two or more variables to vary together in a related way.
# Distribution of a continuous variable (price) broken down by a categorical
# variable (cut): one frequency polygon per cut, 500-wide bins
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
# Density instead of counts. after_stat(density) refers to the `density`
# variable computed by the stat; it replaces the deprecated `..density..`
# notation (after_stat() requires ggplot2 >= 3.3.0).
ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
For computed variables, see geom_freqpoly
Components of a boxplot:
# Price by cut
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot()
# Highway mileage by car class, classes in their default order
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()
# Reorder the x variable: class is ordered by the median value of hwy
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot()
# Same reordered plot with the coordinates flipped
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot() +
  coord_flip()
Count the number of observations for each combination
# Circle size shows the number of observations for each cut/color combination
ggplot(diamonds, aes(x = cut, y = color)) +
  geom_count()
# The color of squares: start from the count of each color/cut combination
count(diamonds, color, cut)
## # A tibble: 35 x 3
## color cut n
## <ord> <ord> <int>
## 1 D Fair 163
## 2 D Good 662
## 3 D Very Good 1513
## 4 D Premium 1603
## 5 D Ideal 2834
## 6 E Fair 224
## 7 E Good 933
## 8 E Very Good 2400
## 9 E Premium 2337
## 10 E Ideal 3903
## # ... with 25 more rows
# Tile plot of the color/cut counts: the fill of each tile is mapped to n
ggplot(count(diamonds, color, cut), aes(x = color, y = cut, fill = n)) +
  geom_tile()
# Plain scatterplot of price against carat: too crowded to read
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point()
# Change the transparency so that dense regions stand out
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 100)
install.packages("hexbin", repos = "https://cloud.r-project.org", dependencies = TRUE)
library(hexbin)
# Bin carat and price in two dimensions: rectangular bins, then hexagonal bins
ggplot(smaller, aes(x = carat, y = price)) +
  geom_bin2d()
ggplot(smaller, aes(x = carat, y = price)) +
  geom_hex()
# When x is not a categorical variable: group carat into 0.1-wide bins and
# draw one boxplot per bin
ggplot(smaller, aes(x = carat, y = price, group = cut_width(carat, 0.1))) +
  geom_boxplot()
# Change the width of each boxplot by the number of cases in its bin
ggplot(smaller, aes(x = carat, y = price, group = cut_width(carat, 0.1))) +
  geom_boxplot(varwidth = TRUE)
# Approximately the same number of points in each bin; note that the second
# argument of cut_number() is a number of bins, not a bin width
ggplot(smaller, aes(x = carat, y = price, group = cut_number(carat, 20))) +
  geom_boxplot()
From this chapter, you have FIVE questions for Weekly Homework #3.
Submit your R script file that includes your answers to individual questions (use #comment for discussion) on Canvas.
Note that online chapters differ from those of the print copy. Check the page numbers below.
Due at 11:59:00 PM (EST) on Sunday, February 3rd.
No 1. How could you rescale the count dataset above to more clearly show the distribution of cut within colour, or colour within cut?
No 2. Use geom_tile() together with dplyr to explore how average flight delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?
No 3. Why is it slightly better to use aes(x = color, y = cut) rather than aes(x = cut, y = color) in the example above?
No 1. Instead of summarising the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs cut_number()? How does that impact a visualisation of the 2d distribution of carat and price?
No 4. Combine two of the techniques you’ve learned to visualise the combined distribution of cut, carat, and price.