Modern Dive Section 3

Harold Nelson

9/20/2017

Load the Necessary Packages

library(nycflights13)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(gapminder)

The Magrittr Pipe

A pipe channels the output of one R function into the first argument of the function that follows it. For example, suppose you want to calculate the mean value of the square roots of the first 10 integers. Here are different ways to do this.

# Input: the integers 1 through 10.
x <- 1:10

# Approach 1: hold the intermediate result in a temporary variable.

x_sqroot <- sqrt(x)
mean_sqroot <- mean(x_sqroot)
mean_sqroot
## [1] 2.246828
# Approach 2: nest the calls, so the innermost call is evaluated first.
mean_sqroot <- mean(sqrt(x))
mean_sqroot
## [1] 2.246828
# Approach 3: use a pipe, so the steps read left to right in the order
# they are applied.

mean_sqroot <- x %>%
  sqrt() %>%
  mean()
mean_sqroot
## [1] 2.246828
# Variant of approach 3: right-assignment (->) puts the target name at the
# end of the chain, which is in keeping with the notion of piping.
x %>%
  sqrt() %>%
  mean() -> mean_sqroot
mean_sqroot
## [1] 2.246828

The filter function

This is part of dplyr, which is the next topic in ModernDive, but we want it now. It creates a subset of the rows of its first argument, which must be a dataframe. The second argument is a logical expression which defines the rows allowed through by the filter.

For example, suppose we want a dataframe, big, consisting of the rows from the dataframe mtcars in which the value of disp is greater than 300.

# Function-call form: keep the rows of mtcars where disp exceeds 300.
big <- filter(mtcars, disp > 300)

# Pipe form of the same filter: mtcars is passed in as the first argument.
big2 <- mtcars %>%
  filter(disp > 300)

# Base R form: index the rows with a logical vector built from the disp
# column. The syntax is more awkward.
big3 <- mtcars[mtcars$disp > 300, ]

# Inspect the three results side by side.

glimpse(big)
## Observations: 11
## Variables: 11
## $ mpg  <dbl> 18.7, 14.3, 10.4, 10.4, 14.7, 15.5, 15.2, 13.3, 19.2, 15....
## $ cyl  <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
## $ disp <dbl> 360, 360, 472, 460, 440, 318, 304, 350, 400, 351, 301
## $ hp   <dbl> 175, 245, 205, 215, 230, 150, 150, 245, 175, 264, 335
## $ drat <dbl> 3.15, 3.21, 2.93, 3.00, 3.23, 2.76, 3.15, 3.73, 3.08, 4.2...
## $ wt   <dbl> 3.440, 3.570, 5.250, 5.424, 5.345, 3.520, 3.435, 3.840, 3...
## $ qsec <dbl> 17.02, 15.84, 17.98, 17.82, 17.42, 16.87, 17.30, 15.41, 1...
## $ vs   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ am   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1
## $ gear <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5
## $ carb <dbl> 2, 4, 4, 4, 4, 2, 2, 4, 2, 4, 8
glimpse(big2)
## Observations: 11
## Variables: 11
## $ mpg  <dbl> 18.7, 14.3, 10.4, 10.4, 14.7, 15.5, 15.2, 13.3, 19.2, 15....
## $ cyl  <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
## $ disp <dbl> 360, 360, 472, 460, 440, 318, 304, 350, 400, 351, 301
## $ hp   <dbl> 175, 245, 205, 215, 230, 150, 150, 245, 175, 264, 335
## $ drat <dbl> 3.15, 3.21, 2.93, 3.00, 3.23, 2.76, 3.15, 3.73, 3.08, 4.2...
## $ wt   <dbl> 3.440, 3.570, 5.250, 5.424, 5.345, 3.520, 3.435, 3.840, 3...
## $ qsec <dbl> 17.02, 15.84, 17.98, 17.82, 17.42, 16.87, 17.30, 15.41, 1...
## $ vs   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ am   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1
## $ gear <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5
## $ carb <dbl> 2, 4, 4, 4, 4, 2, 2, 4, 2, 4, 8
glimpse(big3)
## Observations: 11
## Variables: 11
## $ mpg  <dbl> 18.7, 14.3, 10.4, 10.4, 14.7, 15.5, 15.2, 13.3, 19.2, 15....
## $ cyl  <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
## $ disp <dbl> 360, 360, 472, 460, 440, 318, 304, 350, 400, 351, 301
## $ hp   <dbl> 175, 245, 205, 215, 230, 150, 150, 245, 175, 264, 335
## $ drat <dbl> 3.15, 3.21, 2.93, 3.00, 3.23, 2.76, 3.15, 3.73, 3.08, 4.2...
## $ wt   <dbl> 3.440, 3.570, 5.250, 5.424, 5.345, 3.520, 3.435, 3.840, 3...
## $ qsec <dbl> 17.02, 15.84, 17.98, 17.82, 17.42, 16.87, 17.30, 15.41, 1...
## $ vs   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ am   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1
## $ gear <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5
## $ carb <dbl> 2, 4, 4, 4, 4, 2, 2, 4, 2, 4, 8

Replicate the Gapminder Graph

First use filter and a few other functions from dplyr to get the dataframe we want for plotting.

# Build the plotting data: keep only the 2007 rows, give the measurement
# columns display-friendly names, and drop the year column, which is
# constant after the filter.
gapminder_2007 <- gapminder %>%
  filter(year == 2007) %>%
  rename(
    Country           = country,
    Continent         = continent,
    `Life Expectancy` = lifeExp,
    Population        = pop,
    `GDP per Capita`  = gdpPercap
  ) %>%
  select(-year)
glimpse(gapminder_2007)
## Observations: 142
## Variables: 5
## $ Country           <fctr> Afghanistan, Albania, Algeria, Angola, Arge...
## $ Continent         <fctr> Asia, Europe, Africa, Africa, Americas, Oce...
## $ `Life Expectancy` <dbl> 43.828, 76.423, 72.301, 42.731, 75.320, 81.2...
## $ Population        <int> 31889923, 3600523, 33333216, 12420476, 40301...
## $ `GDP per Capita`  <dbl> 974.5803, 5937.0295, 6223.3675, 4797.2313, 1...
# For comparison, the untouched source dataframe.
glimpse(gapminder)
## Observations: 1,704
## Variables: 6
## $ country   <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.8...
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 78...

Now do the graph

# Scatterplot of life expectancy against GDP per capita, with point size
# mapped to population and point colour mapped to continent.
ggplot(
  data = gapminder_2007,
  mapping = aes(
    x = `GDP per Capita`,
    y = `Life Expectancy`,
    size = Population,
    colour = Continent
  )
) +
  geom_point()

Replicate the AS Scatterplot

# Keep only the rows of flights whose carrier code is "AS".
all_alaska_flights <- filter(flights, carrier == "AS")
# Arrival delay against departure delay. The low alpha makes each point
# nearly transparent, so regions where many points overlap appear darker.
ggplot(
  data = all_alaska_flights,
  mapping = aes(x = dep_delay, y = arr_delay)
) +
  geom_point(alpha = 0.05)
## Warning: Removed 5 rows containing missing values (geom_point).

What does the distribution of delay times look like?

# Histogram of dep_delay; no bin settings are supplied to geom_histogram().
ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).

The scale is cramped, so apply a log transformation by scaling the x-axis. Because the logarithm is undefined for zero and negative values, we first need to eliminate the 0 and negative delays.

# Keep only strictly positive departure delays, then draw the histogram
# with a log10-scaled x-axis.
all_alaska_flights %>%
  filter(dep_delay > 0) %>%
  ggplot(mapping = aes(x = dep_delay)) +
  geom_histogram() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Let’s get the numerical details for the positive delay times.

# Summary statistics for the dep_delay column, restricted to flights with
# a strictly positive departure delay.
all_alaska_flights %>%
  filter(dep_delay > 0) %>%
  select(dep_delay) %>%
  summary()
##    dep_delay     
##  Min.   :  1.00  
##  1st Qu.:  4.00  
##  Median : 11.50  
##  Mean   : 31.34  
##  3rd Qu.: 36.75  
##  Max.   :225.00

What about the negative delay times?

# Summary statistics for the dep_delay column, restricted to flights with
# a strictly negative departure delay.
all_alaska_flights %>%
  filter(dep_delay < 0) %>%
  select(dep_delay) %>%
  summary()
##    dep_delay      
##  Min.   :-21.000  
##  1st Qu.: -9.000  
##  Median : -6.000  
##  Mean   : -6.441  
##  3rd Qu.: -4.000  
##  Max.   : -1.000

What fraction of flights had a zero dep_delay?

# The mean of a logical vector is the proportion of TRUE values;
# na.rm = TRUE leaves flights with a missing dep_delay out of the calculation.
mean(all_alaska_flights$dep_delay == 0, na.rm = TRUE)
## [1] 0.03932584

What about positive and negative delay times?

# Proportion of flights with a positive departure delay, ignoring missing
# values.
mean(all_alaska_flights$dep_delay > 0, na.rm = TRUE)
## [1] 0.3174157
# Proportion of flights with a negative departure delay, ignoring missing
# values.
mean(all_alaska_flights$dep_delay < 0, na.rm = TRUE)
## [1] 0.6432584