# Modern Dive Section 3

9/20/2017

library(nycflights13)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(gapminder)

# The Magrittr Pipe

A pipe channels output from one R function into the first argument of the following function. For example, suppose you want to calculate the mean value of the square roots of the first 10 integers. Here are different ways to do this.

# Input shared by all of the variants below: the integers 1 through 10.
x = 1:10

# 1 Create a temporary variable

# The intermediate result is stored under its own name before being averaged.
x_sqroot = sqrt(x)
mean_sqroot = mean(x_sqroot)
mean_sqroot
## [1] 2.246828
# 2 Nest the function calls
# No temporary variable; the calls are read from the inside out.
mean_sqroot = mean(sqrt(x))
mean_sqroot
## [1] 2.246828
# 3 Use a pipe

# Each %>% passes the value on its left as the first argument of the call on
# its right, so the steps read left to right in the order they are applied.
mean_sqroot <- x %>% sqrt() %>% mean()
mean_sqroot
## [1] 2.246828
# You could also write the following, which is in keeping with the notion of piping.
# The right-assignment arrow -> stores the final value in the name at the end.
x %>% sqrt() %>% mean() -> mean_sqroot
mean_sqroot
## [1] 2.246828

# The filter function

The filter function is part of dplyr, which is the next topic in ModernDive, but we want it now. It creates a subset of the rows of its first argument, which must be a dataframe. The second argument is a logical expression which defines the rows allowed through by the filter.

For example, suppose we want a dataframe, big, consisting of the rows from the dataframe mtcars in which the value of disp is greater than 300.

# Three equivalent ways to keep only the rows of mtcars whose disp exceeds 300.
# The results are stored as big, big2 and big3 so they can be compared below.
big = filter(mtcars,disp>300)

# We could also write this as the following using a pipe.
mtcars %>% filter(disp>300) -> big2

# The base R syntax is more awkward.
big3 = mtcars[mtcars$disp > 300,]

# Compare these.
# Each glimpse() call is on its own line so that it is actually executed;
# the "##" lines that follow it are the printed output, one row per column.
glimpse(big)
## Observations: 11
## Variables: 11
## $ mpg  <dbl> 18.7, 14.3, 10.4, 10.4, 14.7, 15.5, 15.2, 13.3, 19.2, 15....
## $ cyl  <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
## $ disp <dbl> 360, 360, 472, 460, 440, 318, 304, 350, 400, 351, 301
## $ hp   <dbl> 175, 245, 205, 215, 230, 150, 150, 245, 175, 264, 335
## $ drat <dbl> 3.15, 3.21, 2.93, 3.00, 3.23, 2.76, 3.15, 3.73, 3.08, 4.2...
## $ wt   <dbl> 3.440, 3.570, 5.250, 5.424, 5.345, 3.520, 3.435, 3.840, 3...
## $ qsec <dbl> 17.02, 15.84, 17.98, 17.82, 17.42, 16.87, 17.30, 15.41, 1...
## $ vs   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ am   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1
## $ gear <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5
## $ carb <dbl> 2, 4, 4, 4, 4, 2, 2, 4, 2, 4, 8
glimpse(big2)
## Observations: 11
## Variables: 11
## $ mpg  <dbl> 18.7, 14.3, 10.4, 10.4, 14.7, 15.5, 15.2, 13.3, 19.2, 15....
## $ cyl  <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
## $ disp <dbl> 360, 360, 472, 460, 440, 318, 304, 350, 400, 351, 301
## $ hp   <dbl> 175, 245, 205, 215, 230, 150, 150, 245, 175, 264, 335
## $ drat <dbl> 3.15, 3.21, 2.93, 3.00, 3.23, 2.76, 3.15, 3.73, 3.08, 4.2...
## $ wt   <dbl> 3.440, 3.570, 5.250, 5.424, 5.345, 3.520, 3.435, 3.840, 3...
## $ qsec <dbl> 17.02, 15.84, 17.98, 17.82, 17.42, 16.87, 17.30, 15.41, 1...
## $ vs   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ am   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1
## $ gear <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5
## $ carb <dbl> 2, 4, 4, 4, 4, 2, 2, 4, 2, 4, 8
glimpse(big3)
## Observations: 11
## Variables: 11
## $ mpg  <dbl> 18.7, 14.3, 10.4, 10.4, 14.7, 15.5, 15.2, 13.3, 19.2, 15....
## $ cyl  <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
## $ disp <dbl> 360, 360, 472, 460, 440, 318, 304, 350, 400, 351, 301
## $ hp   <dbl> 175, 245, 205, 215, 230, 150, 150, 245, 175, 264, 335
## $ drat <dbl> 3.15, 3.21, 2.93, 3.00, 3.23, 2.76, 3.15, 3.73, 3.08, 4.2...
## $ wt   <dbl> 3.440, 3.570, 5.250, 5.424, 5.345, 3.520, 3.435, 3.840, 3...
## $ qsec <dbl> 17.02, 15.84, 17.98, 17.82, 17.42, 16.87, 17.30, 15.41, 1...
## $ vs   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ am   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1
## $ gear <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5
## $ carb <dbl> 2, 4, 4, 4, 4, 2, 2, 4, 2, 4, 8

# Replicate the Gapminder Graph

First use filter and a few other functions from dplyr to get the dataframe we want for plotting.

# Build the dataframe used for the plot: keep only the 2007 rows of gapminder,
# drop the year column (it is constant after the filter), and rename the
# remaining columns.
# Names that contain spaces are not syntactic R names, so they must be wrapped
# in backticks; without them the rename() call does not parse.
gapminder_2007 <- gapminder %>%
  filter(year == 2007) %>%
  select(-year) %>%
  rename(
    Country = country,
    Continent = continent,
    `Life Expectancy` = lifeExp,
    Population = pop,
    `GDP per Capita` = gdpPercap
  )
glimpse(gapminder_2007)
## Observations: 142
## Variables: 5
## $ Country           <fctr> Afghanistan, Albania, Algeria, Angola, Arge...
## $ Continent         <fctr> Asia, Europe, Africa, Africa, Americas, Oce...
## $ `Life Expectancy` <dbl> 43.828, 76.423, 72.301, 42.731, 75.320, 81.2...
## $ Population        <int> 31889923, 3600523, 33333216, 12420476, 40301...
## $ `GDP per Capita`  <dbl> 974.5803, 5937.0295, 6223.3675, 4797.2313, 1...
# Compare to the original dataframe
glimpse(gapminder)
## Observations: 1,704
## Variables: 6
## $ country   <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.8...
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 78...
# Now do the graph
ggplot(data=gapminder_2007, aes(x=`GDP per Capita`, y=`Life Expectancy`, size=Population, col=Continent)) +
  geom_point()

# Replicate the AS Scatterplot

all_alaska_flights <- flights %>%
  filter(carrier == "AS")
ggplot(data = all_alaska_flights, aes(x = dep_delay, y = arr_delay)) +
  geom_point(alpha=.05)
## Warning: Removed 5 rows containing missing values (geom_point).

What does the distribution of delay times look like?

all_alaska_flights %>%
  ggplot(aes(x=dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).

Cramped scale - do a log transformation by scaling the x-axis. Need to eliminate 0 and negative delays.

all_alaska_flights %>%
  filter(dep_delay> 0) %>%
  ggplot(aes(x=dep_delay)) +
  geom_histogram() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Let's get the numerical details for the positive delay times.

all_alaska_flights %>%
  filter(dep_delay> 0) %>%
  select(dep_delay) %>%
  summary()
##    dep_delay
##  Min.   :  1.00
##  1st Qu.:  4.00
##  Median : 11.50
##  Mean   : 31.34
##  3rd Qu.: 36.75
##  Max.   :225.00

What about the negative delay times?

all_alaska_flights %>%
  filter(dep_delay < 0) %>%
  select(dep_delay) %>%
  summary()
##    dep_delay
##  Min.   :-21.000
##  1st Qu.: -9.000
##  Median : -6.000
##  Mean   : -6.441
##  3rd Qu.: -4.000
##  Max.   : -1.000

What fraction of flights had a zero dep_delay?

mean(all_alaska_flights$dep_delay == 0,na.rm=TRUE)
## [1] 0.03932584

What about positive and negative delay times?

# The mean of a logical vector is the fraction of TRUE values, so each call
# below gives the share of flights meeting the condition. na.rm = TRUE drops
# the flights whose dep_delay is missing before averaging.
# Each call is on its own line so that it is executed; the "##" line after it
# is its printed result.

# Fraction of flights with a positive dep_delay.
mean(all_alaska_flights$dep_delay > 0, na.rm = TRUE)
## [1] 0.3174157

# Fraction of flights with a negative dep_delay.
mean(all_alaska_flights$dep_delay < 0, na.rm = TRUE)
## [1] 0.6432584