Wk 4 Assignment: Data visualisation from the Hands-on Programming with R

---
title: "Untitled"
author: "Suma Pendyala"
date: "6/7/2020"
output: html_document
---

Sections: 7.1.1 Prerequisites

library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Sections: 7.3.1 Visualising distributions

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))
plot of chunk unnamed-chunk-2
diamonds %>%
  count(cut)
## # A tibble: 5 x 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551
#> # A tibble: 5 x 2
#>   cut           n
#>   <ord>     <int>
#> 1 Fair       1610
#> 2 Good       4906
#> 3 Very Good 12082
#> 4 Premium   13791
#> 5 Ideal     21551
ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = carat), binwidth = 0.5)
plot of chunk unnamed-chunk-4
diamonds %>%
  count(cut_width(carat, 0.5))
## # A tibble: 11 x 2
##    `cut_width(carat, 0.5)`     n
##    <fct>                   <int>
##  1 [-0.25,0.25]              785
##  2 (0.25,0.75]             29498
##  3 (0.75,1.25]             15977
##  4 (1.25,1.75]              5313
##  5 (1.75,2.25]              2002
##  6 (2.25,2.75]               322
##  7 (2.75,3.25]                32
##  8 (3.25,3.75]                 5
##  9 (3.75,4.25]                 4
## 10 (4.25,4.75]                 1
## 11 (4.75,5.25]                 1
#> # A tibble: 11 x 2
#>   `cut_width(carat, 0.5)`     n
#>   <fct>                   <int>
#> 1 [-0.25,0.25]              785
#> 2 (0.25,0.75]             29498
#> 3 (0.75,1.25]             15977
#> 4 (1.25,1.75]              5313
#> 5 (1.75,2.25]              2002
#> 6 (2.25,2.75]               322
#> # ... with 5 more rows
smaller <- diamonds %>%
  filter(carat < 3)

ggplot(data = smaller, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.1)
plot of chunk unnamed-chunk-6
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy), color = "blue")
plot of chunk unnamed-chunk-7
ggplot(data = smaller, mapping = aes(x = carat, colour = cut)) +
  geom_freqpoly(binwidth = 0.1)
plot of chunk unnamed-chunk-8
dim(mpg)
## [1] 234  11
nrow(mpg)
## [1] 234
ncol(mpg)
## [1] 11

7.3.2 Typical values

ggplot(data = smaller, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.01)
plot of chunk unnamed-chunk-10
ggplot(data = faithful, mapping = aes(x = eruptions)) +
  geom_histogram(binwidth = 0.25)
plot of chunk unnamed-chunk-11

7.3.3 Unusual values

ggplot(diamonds) +
  geom_histogram(mapping = aes(x = y), binwidth = 0.5)
plot of chunk unnamed-chunk-12
ggplot(diamonds) +
  geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))
plot of chunk unnamed-chunk-13
unusual <- diamonds %>%
  filter(y < 3 | y > 20) %>%
  select(price, x, y, z) %>%
  arrange(y)
unusual
## # A tibble: 9 x 4
##   price     x     y     z
##   <int> <dbl> <dbl> <dbl>
## 1  5139  0      0    0   
## 2  6381  0      0    0   
## 3 12800  0      0    0   
## 4 15686  0      0    0   
## 5 18034  0      0    0   
## 6  2130  0      0    0   
## 7  2130  0      0    0   
## 8  2075  5.15  31.8  5.12
## 9 12210  8.09  58.9  8.06
#> # A tibble: 9 x 4
#>   price     x     y     z
#>   <int> <dbl> <dbl> <dbl>
#> 1  5139  0      0    0   
#> 2  6381  0      0    0   
#> 3 12800  0      0    0   
#> 4 15686  0      0    0   
#> 5 18034  0      0    0   
#> 6  2130  0      0    0   
#> 7  2130  0      0    0   
#> 8  2075  5.15  31.8  5.12
#> 9 12210  8.09  58.9  8.06

7.4 Missing values

# Replace implausible y measurements with NA instead of dropping whole rows,
# so the other columns of those diamonds are kept and the plots below
# actually exercise ggplot2's handling of missing values.
diamonds2 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point()
plot of chunk unnamed-chunk-16
#> Warning: Removed 9 rows containing missing values (geom_point).
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)
plot of chunk unnamed-chunk-17
# Contrast scheduled departure times of cancelled and non-cancelled flights.
nycflights13::flights %>%
  mutate(
    # A flight with no recorded departure time is flagged as cancelled.
    cancelled = is.na(dep_time),
    # Split the scheduled time into its hundreds part and the remainder
    # (presumably HHMM-encoded hours and minutes -- confirm in the data docs).
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    # Recombine the two parts as fractional hours for a continuous x axis.
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(mapping = aes(x = sched_dep_time, colour = cancelled)) +
    geom_freqpoly(binwidth = 0.25)
plot of chunk unnamed-chunk-18

7.5 Covariation

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)
plot of chunk unnamed-chunk-19
ggplot(diamonds) +
  geom_bar(mapping = aes(x = cut))
plot of chunk unnamed-chunk-20
ggplot(data = mpg, mapping = aes(y = hwy, x = displ)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
plot of chunk unnamed-chunk-21
# Plot density rather than raw counts so the price distributions of the cuts
# can be compared even though the cuts differ greatly in size. after_stat()
# replaces the older `..density..` notation (available since ggplot2 3.3.0).
ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)
plot of chunk unnamed-chunk-22
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()
plot of chunk unnamed-chunk-23
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()
plot of chunk unnamed-chunk-24
ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))
plot of chunk unnamed-chunk-25
ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  coord_flip()
plot of chunk unnamed-chunk-26

7.5.2 Two categorical variables

ggplot(data = diamonds) +
  geom_count(mapping = aes(x = cut, y = color))
plot of chunk unnamed-chunk-27
diamonds %>%
  count(color, cut)
## # A tibble: 35 x 3
##    color cut           n
##    <ord> <ord>     <int>
##  1 D     Fair        163
##  2 D     Good        662
##  3 D     Very Good  1513
##  4 D     Premium    1603
##  5 D     Ideal      2834
##  6 E     Fair        224
##  7 E     Good        933
##  8 E     Very Good  2400
##  9 E     Premium    2337
## 10 E     Ideal      3903
## # ... with 25 more rows
#> # A tibble: 35 x 3
#>   color cut           n
#>   <ord> <ord>     <int>
#> 1 D     Fair        163
#> 2 D     Good        662
#> 3 D     Very Good  1513
#> 4 D     Premium    1603
#> 5 D     Ideal      2834
#> 6 E     Fair        224
#> # ... with 29 more rows
diamonds %>%
  count(color, cut) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
    geom_tile(mapping = aes(fill = n))
plot of chunk unnamed-chunk-29

7.5.3 Two continuous variables

ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price))
plot of chunk unnamed-chunk-30
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price), alpha = 1 / 100)
plot of chunk unnamed-chunk-31
ggplot(data = smaller) +
  geom_bin2d(mapping = aes(x = carat, y = price))
plot of chunk unnamed-chunk-32
# geom_hex() needs the hexbin package for its binning; without it the layer's
# computation fails with a warning. Install it once, outside this document,
# with install.packages("hexbin").
if (requireNamespace("hexbin", quietly = TRUE)) {
  ggplot(data = smaller) +
    geom_hex(mapping = aes(x = carat, y = price))
} else {
  message("Skipping geom_hex() plot: package 'hexbin' is not installed.")
}
## Warning: Computation failed in `stat_binhex()`:
##   Package `hexbin` required for `stat_binhex`.
##   Please install and try again.
plot of chunk unnamed-chunk-32
ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
  geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))
plot of chunk unnamed-chunk-33
ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
  geom_boxplot(mapping = aes(group = cut_number(carat, 20)))
plot of chunk unnamed-chunk-34

7.6 Patterns and models

ggplot(data = faithful) +
  geom_point(mapping = aes(x = eruptions, y = waiting))
plot of chunk unnamed-chunk-35
library(modelr)

# Fit a linear model of log(price) on log(carat).
mod <- lm(log(price) ~ log(carat), data = diamonds)

# Attach the model residuals as `resid`, then exponentiate them to move
# them off the log scale.
diamonds2 <- mutate(add_residuals(diamonds, mod), resid = exp(resid))

# Residuals against carat: what is left of price once the fitted carat
# relationship has been removed.
ggplot(diamonds2, aes(x = carat, y = resid)) +
  geom_point()
plot of chunk unnamed-chunk-36

7.6 Patterns and models

ggplot(data = faithful, mapping = aes(x = eruptions)) +
  geom_freqpoly(binwidth = 0.25)
plot of chunk unnamed-chunk-37

7.7 ggplot2 calls

ggplot(data = faithful, mapping = aes(x = eruptions)) +
  geom_freqpoly(binwidth = 0.25)
plot of chunk unnamed-chunk-38
ggplot(faithful, aes(eruptions)) +
  geom_freqpoly(binwidth = 0.25)
plot of chunk unnamed-chunk-39
diamonds %>%
  count(cut, clarity) %>%
  ggplot(aes(clarity, cut, fill = n)) +
    geom_tile()
plot of chunk unnamed-chunk-40