Wk 4 Assignment: Data visualisation from the Hands-on Programming with R
---
title: "Untitled"
author: "Suma Pendyala"
date: "6/7/2020"
output: html_document
---
Sections: 7.1.1 Prerequisites
library(tidyverse)
Sections: 7.3.1 Visualising distributions
# Bar chart of the number of diamonds in each cut category.
ggplot(diamonds, aes(x = cut)) +
  geom_bar()

# The bar heights, computed by hand.
count(diamonds, cut)
## # A tibble: 5 x 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551
#> # A tibble: 5 x 2
#>   cut           n
#>   <ord>     <int>
#> 1 Fair       1610
#> 2 Good       4906
#> 3 Very Good 12082
#> 4 Premium   13791
#> 5 Ideal     21551
# Histogram of carat using bins that are 0.5 carat wide.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

# The same bins, tabulated by hand with cut_width().
count(diamonds, cut_width(carat, 0.5))
## # A tibble: 11 x 2
##    `cut_width(carat, 0.5)`     n
##    <fct>                   <int>
##  1 [-0.25,0.25]              785
##  2 (0.25,0.75]             29498
##  3 (0.75,1.25]             15977
##  4 (1.25,1.75]              5313
##  5 (1.75,2.25]              2002
##  6 (2.25,2.75]               322
##  7 (2.75,3.25]                32
##  8 (3.25,3.75]                 5
##  9 (3.75,4.25]                 4
## 10 (4.25,4.75]                 1
## 11 (4.75,5.25]                 1
#> # A tibble: 11 x 2
#>   `cut_width(carat, 0.5)`     n
#>   <fct>                   <int>
#> 1 [-0.25,0.25]              785
#> 2 (0.25,0.75]             29498
#> 3 (0.75,1.25]             15977
#> 4 (1.25,1.75]              5313
#> 5 (1.75,2.25]              2002
#> 6 (2.25,2.75]               322
#> # … with 5 more rows
# Keep only diamonds under 3 carats so the plots below can zoom in on the
# bulk of the data. `smaller` is reused by several later plots.
smaller <- diamonds %>%
  filter(carat < 3)

# Finer-grained histogram (0.1 carat bins) of the reduced data set.
ggplot(data = smaller, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

# Engine displacement against highway mileage, every point drawn in blue.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(colour = "blue")

# Overlaid frequency polygons of carat, one line per cut.
smaller %>%
  ggplot(aes(x = carat, colour = cut)) +
  geom_freqpoly(binwidth = 0.1)

# Size of the mpg data set: rows and columns together, then separately.
dim(mpg)
## [1] 234 11
nrow(mpg)
## [1] 234
ncol(mpg)
## [1] 11
7.3.2 Typical values
# Very narrow bins (0.01 carat) on the reduced diamonds data.
ggplot(smaller) +
  geom_histogram(aes(x = carat), binwidth = 0.01)

# Eruption lengths from the faithful data set, in 0.25-wide bins.
ggplot(faithful) +
  geom_histogram(aes(x = eruptions), binwidth = 0.25)

7.3.3 Unusual values
# Distribution of the `y` variable over the full range of counts.
ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.5)

# Same histogram, zoomed to counts between 0 and 50 so that sparsely
# populated bins become visible.
ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

# Pull out the rows whose `y` value is below 3 or above 20, keeping only
# price and the three dimension columns, ordered by `y`.
unusual <- diamonds %>%
  filter(y < 3 | y > 20) %>%
  select(price, x, y, z) %>%
  arrange(y)

# Print the result.
unusual
## # A tibble: 9 x 4
##   price     x     y     z
##   <int> <dbl> <dbl> <dbl>
## 1  5139  0      0    0
## 2  6381  0      0    0
## 3 12800  0      0    0
## 4 15686  0      0    0
## 5 18034  0      0    0
## 6  2130  0      0    0
## 7  2130  0      0    0
## 8  2075  5.15  31.8  5.12
## 9 12210  8.09  58.9  8.06
#> # A tibble: 9 x 4
#>   price     x     y     z
#>   <int> <dbl> <dbl> <dbl>
#> 1  5139  0      0    0
#> 2  6381  0      0    0
#> 3 12800  0      0    0
#> 4 15686  0      0    0
#> 5 18034  0      0    0
#> 6  2130  0      0    0
#> 7  2130  0      0    0
#> 8  2075  5.15  31.8  5.12
#> 9 12210  8.09  58.9  8.06
7.4 Missing values
# Recode the implausible `y` values (below 3 or above 20) as NA instead of
# dropping their rows. Filtering the rows out would leave no missing values
# in `y`, so the warning recorded below could never be raised and
# `na.rm = TRUE` would have nothing to suppress.
diamonds2 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

# Plotting now reports the rows that could not be drawn.
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point()

#> Warning: Removed 9 rows containing missing values (geom_point).

# Same plot with the missing values dropped silently.
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)

# Compare scheduled departure times of cancelled and non-cancelled flights.
# A flight counts as cancelled when its departure time is missing; the
# scheduled time (stored as HHMM) is converted to fractional hours.
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(mapping = aes(x = sched_dep_time, colour = cancelled)) +
  geom_freqpoly(binwidth = 1/4)

7.5 Covariation
# Price distribution per cut, as raw counts in bins of 500.
ggplot(diamonds, aes(x = price, colour = cut)) +
  geom_freqpoly(binwidth = 500)

# Number of diamonds in each cut.
ggplot(diamonds, aes(x = cut)) +
  geom_bar()

# Displacement against highway mileage with a smoothed trend line and no
# confidence band.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(se = FALSE)

# Price distribution per cut again, with density on the y axis instead of
# counts.
# NOTE(review): the `..density..` notation is deprecated in recent ggplot2
# releases in favour of after_stat(density); left unchanged here.
ggplot(diamonds, aes(x = price, y = ..density.., colour = cut)) +
  geom_freqpoly(binwidth = 500)

# Price by cut.
ggplot(diamonds) +
  geom_boxplot(aes(x = cut, y = price))

# Highway mileage by vehicle class, classes in their default order.
ggplot(mpg) +
  geom_boxplot(aes(x = class, y = hwy))

# Same boxplots with the classes ordered by their median highway mileage.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot()

# Ordered boxplots again, with the axes swapped.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot() +
  coord_flip()

7.5.2 Two categorical variables
# One point per cut/color combination, sized by the number of diamonds.
ggplot(diamonds, aes(x = cut, y = color)) +
  geom_count()

# The same counts as a table.
count(diamonds, color, cut)
## # A tibble: 35 x 3
##    color cut           n
##    <ord> <ord>     <int>
##  1 D     Fair        163
##  2 D     Good        662
##  3 D     Very Good  1513
##  4 D     Premium    1603
##  5 D     Ideal      2834
##  6 E     Fair        224
##  7 E     Good        933
##  8 E     Very Good  2400
##  9 E     Premium    2337
## 10 E     Ideal      3903
## # ... with 25 more rows
#> # A tibble: 35 x 3
#>   color cut           n
#>   <ord> <ord>     <int>
#> 1 D     Fair        163
#> 2 D     Good        662
#> 3 D     Very Good  1513
#> 4 D     Premium    1603
#> 5 D     Ideal      2834
#> 6 E     Fair        224
#> # … with 29 more rows
# Tile plot of the color/cut counts, with the fill showing the count `n`.
diamonds %>%
  count(color, cut) %>%
  ggplot(aes(x = color, y = cut, fill = n)) +
  geom_tile()

7.5.3 Two continuous variables
# Carat against price for every diamond.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point()

# Same scatterplot with nearly transparent points.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 100)

# Rectangular 2-D bins over the reduced data set.
ggplot(smaller, aes(x = carat, y = price)) +
  geom_bin2d()

# geom_hex() relies on the hexbin package; install it once if it is missing:
# install.packages("hexbin")

# Hexagonal 2-D bins of carat against price on the reduced data set.
ggplot(data = smaller) +
  geom_hex(mapping = aes(x = carat, y = price))
## Warning: Computation failed in `stat_binhex()`:
## Package `hexbin` required for `stat_binhex`.
## Please install and try again.

# Price boxplots for carat split into bins that are 0.1 wide.
ggplot(smaller, aes(x = carat, y = price, group = cut_width(carat, 0.1))) +
  geom_boxplot()

# Price boxplots for carat split into 20 bins via cut_number().
ggplot(smaller, aes(x = carat, y = price, group = cut_number(carat, 20))) +
  geom_boxplot()

7.6 Patterns and models
# Eruption length against waiting time in the faithful data set.
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point()

library(modelr)

# Linear model of log(price) on log(carat).
mod <- lm(log(price) ~ log(carat), data = diamonds)

# Attach the model residuals and exponentiate them, undoing the log
# transform used in the model.
diamonds2 <- diamonds %>%
  add_residuals(mod) %>%
  mutate(resid = exp(resid))

# Residuals plotted against carat.
ggplot(data = diamonds2) +
  geom_point(mapping = aes(x = carat, y = resid))

7.6 Patterns and models
# Frequency polygon of eruption lengths in 0.25-wide bins.
faithful %>%
  ggplot(aes(x = eruptions)) +
  geom_freqpoly(binwidth = 0.25)

7.7 ggplot2 calls
# Fully explicit spelling: `data`, `mapping` and `x` are all passed by name.
ggplot(data = faithful, mapping = aes(x = eruptions)) + geom_freqpoly(binwidth = 0.25)

# The same plot with those leading arguments passed positionally.
ggplot(faithful, aes(eruptions)) + geom_freqpoly(binwidth = 0.25)

# Count each cut/clarity combination, then pipe the counts into a tile plot
# whose fill shows the count `n`.
diamonds %>% count(cut, clarity) %>% ggplot(aes(clarity, cut, fill = n)) + geom_tile()
