pacman::p_load(dplyr, nycflights13, tidyverse, fivethirtyeight, ggplot2)

1. Base R – Motor Trend Car Dataset

data(mtcars)
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Select columns
mtcars[, c("mpg", "cyl")]
##                      mpg cyl
## Mazda RX4           21.0   6
## Mazda RX4 Wag       21.0   6
## Datsun 710          22.8   4
## Hornet 4 Drive      21.4   6
## Hornet Sportabout   18.7   8
## Valiant             18.1   6
## Duster 360          14.3   8
## Merc 240D           24.4   4
## Merc 230            22.8   4
## Merc 280            19.2   6
## Merc 280C           17.8   6
## Merc 450SE          16.4   8
## Merc 450SL          17.3   8
## Merc 450SLC         15.2   8
## Cadillac Fleetwood  10.4   8
## Lincoln Continental 10.4   8
## Chrysler Imperial   14.7   8
## Fiat 128            32.4   4
## Honda Civic         30.4   4
## Toyota Corolla      33.9   4
## Toyota Corona       21.5   4
## Dodge Challenger    15.5   8
## AMC Javelin         15.2   8
## Camaro Z28          13.3   8
## Pontiac Firebird    19.2   8
## Fiat X1-9           27.3   4
## Porsche 914-2       26.0   4
## Lotus Europa        30.4   4
## Ford Pantera L      15.8   8
## Ferrari Dino        19.7   6
## Maserati Bora       15.0   8
## Volvo 142E          21.4   4
subset(mtcars, select=c("mpg", "cyl"))
##                      mpg cyl
## Mazda RX4           21.0   6
## Mazda RX4 Wag       21.0   6
## Datsun 710          22.8   4
## Hornet 4 Drive      21.4   6
## Hornet Sportabout   18.7   8
## Valiant             18.1   6
## Duster 360          14.3   8
## Merc 240D           24.4   4
## Merc 230            22.8   4
## Merc 280            19.2   6
## Merc 280C           17.8   6
## Merc 450SE          16.4   8
## Merc 450SL          17.3   8
## Merc 450SLC         15.2   8
## Cadillac Fleetwood  10.4   8
## Lincoln Continental 10.4   8
## Chrysler Imperial   14.7   8
## Fiat 128            32.4   4
## Honda Civic         30.4   4
## Toyota Corolla      33.9   4
## Toyota Corona       21.5   4
## Dodge Challenger    15.5   8
## AMC Javelin         15.2   8
## Camaro Z28          13.3   8
## Pontiac Firebird    19.2   8
## Fiat X1-9           27.3   4
## Porsche 914-2       26.0   4
## Lotus Europa        30.4   4
## Ford Pantera L      15.8   8
## Ferrari Dino        19.7   6
## Maserati Bora       15.0   8
## Volvo 142E          21.4   4
# Select columns containing "s"
mtcars[,grep("s", colnames(mtcars))]
##                      disp  qsec vs
## Mazda RX4           160.0 16.46  0
## Mazda RX4 Wag       160.0 17.02  0
## Datsun 710          108.0 18.61  1
## Hornet 4 Drive      258.0 19.44  1
## Hornet Sportabout   360.0 17.02  0
## Valiant             225.0 20.22  1
## Duster 360          360.0 15.84  0
## Merc 240D           146.7 20.00  1
## Merc 230            140.8 22.90  1
## Merc 280            167.6 18.30  1
## Merc 280C           167.6 18.90  1
## Merc 450SE          275.8 17.40  0
## Merc 450SL          275.8 17.60  0
## Merc 450SLC         275.8 18.00  0
## Cadillac Fleetwood  472.0 17.98  0
## Lincoln Continental 460.0 17.82  0
## Chrysler Imperial   440.0 17.42  0
## Fiat 128             78.7 19.47  1
## Honda Civic          75.7 18.52  1
## Toyota Corolla       71.1 19.90  1
## Toyota Corona       120.1 20.01  1
## Dodge Challenger    318.0 16.87  0
## AMC Javelin         304.0 17.30  0
## Camaro Z28          350.0 15.41  0
## Pontiac Firebird    400.0 17.05  0
## Fiat X1-9            79.0 18.90  1
## Porsche 914-2       120.3 16.70  0
## Lotus Europa         95.1 16.90  1
## Ford Pantera L      351.0 14.50  0
## Ferrari Dino        145.0 15.50  0
## Maserati Bora       301.0 14.60  0
## Volvo 142E          121.0 18.60  1
# Add a new column
mtcars$hpCyl <- mtcars$hp + mtcars$cyl
head(mtcars$hpCyl)
## [1] 116 116  97 116 183 111
# Filter rows
mtcars[(mtcars$cyl<=6 & mtcars$hp>500), ]
##  [1] mpg   cyl   disp  hp    drat  wt    qsec  vs    am    gear  carb  hpCyl
## <0 rows> (or 0-length row.names)
subset(mtcars, (cyl<=6 & mtcars$hp>500))
##  [1] mpg   cyl   disp  hp    drat  wt    qsec  vs    am    gear  carb  hpCyl
## <0 rows> (or 0-length row.names)
# Summarize for cyl = 4
cyl4 <- mtcars[which(mtcars$cyl==4),]
mean(cyl4$disp)
## [1] 105.1364
max(cyl4$hp)
## [1] 113

2. dplyr Verbs – Columns and Rows

# Column selection
select(mtcars, cyl, mpg)
##                     cyl  mpg
## Mazda RX4             6 21.0
## Mazda RX4 Wag         6 21.0
## Datsun 710            4 22.8
## Hornet 4 Drive        6 21.4
## Hornet Sportabout     8 18.7
## Valiant               6 18.1
## Duster 360            8 14.3
## Merc 240D             4 24.4
## Merc 230              4 22.8
## Merc 280              6 19.2
## Merc 280C             6 17.8
## Merc 450SE            8 16.4
## Merc 450SL            8 17.3
## Merc 450SLC           8 15.2
## Cadillac Fleetwood    8 10.4
## Lincoln Continental   8 10.4
## Chrysler Imperial     8 14.7
## Fiat 128              4 32.4
## Honda Civic           4 30.4
## Toyota Corolla        4 33.9
## Toyota Corona         4 21.5
## Dodge Challenger      8 15.5
## AMC Javelin           8 15.2
## Camaro Z28            8 13.3
## Pontiac Firebird      8 19.2
## Fiat X1-9             4 27.3
## Porsche 914-2         4 26.0
## Lotus Europa          4 30.4
## Ford Pantera L        8 15.8
## Ferrari Dino          6 19.7
## Maserati Bora         8 15.0
## Volvo 142E            4 21.4
select(mtcars, cyl:mpg)
##                     cyl  mpg
## Mazda RX4             6 21.0
## Mazda RX4 Wag         6 21.0
## Datsun 710            4 22.8
## Hornet 4 Drive        6 21.4
## Hornet Sportabout     8 18.7
## Valiant               6 18.1
## Duster 360            8 14.3
## Merc 240D             4 24.4
## Merc 230              4 22.8
## Merc 280              6 19.2
## Merc 280C             6 17.8
## Merc 450SE            8 16.4
## Merc 450SL            8 17.3
## Merc 450SLC           8 15.2
## Cadillac Fleetwood    8 10.4
## Lincoln Continental   8 10.4
## Chrysler Imperial     8 14.7
## Fiat 128              4 32.4
## Honda Civic           4 30.4
## Toyota Corolla        4 33.9
## Toyota Corona         4 21.5
## Dodge Challenger      8 15.5
## AMC Javelin           8 15.2
## Camaro Z28            8 13.3
## Pontiac Firebird      8 19.2
## Fiat X1-9             4 27.3
## Porsche 914-2         4 26.0
## Lotus Europa          4 30.4
## Ford Pantera L        8 15.8
## Ferrari Dino          6 19.7
## Maserati Bora         8 15.0
## Volvo 142E            4 21.4
select(mtcars, -(cyl:mpg))
##                      disp  hp drat    wt  qsec vs am gear carb hpCyl
## Mazda RX4           160.0 110 3.90 2.620 16.46  0  1    4    4   116
## Mazda RX4 Wag       160.0 110 3.90 2.875 17.02  0  1    4    4   116
## Datsun 710          108.0  93 3.85 2.320 18.61  1  1    4    1    97
## Hornet 4 Drive      258.0 110 3.08 3.215 19.44  1  0    3    1   116
## Hornet Sportabout   360.0 175 3.15 3.440 17.02  0  0    3    2   183
## Valiant             225.0 105 2.76 3.460 20.22  1  0    3    1   111
## Duster 360          360.0 245 3.21 3.570 15.84  0  0    3    4   253
## Merc 240D           146.7  62 3.69 3.190 20.00  1  0    4    2    66
## Merc 230            140.8  95 3.92 3.150 22.90  1  0    4    2    99
## Merc 280            167.6 123 3.92 3.440 18.30  1  0    4    4   129
## Merc 280C           167.6 123 3.92 3.440 18.90  1  0    4    4   129
## Merc 450SE          275.8 180 3.07 4.070 17.40  0  0    3    3   188
## Merc 450SL          275.8 180 3.07 3.730 17.60  0  0    3    3   188
## Merc 450SLC         275.8 180 3.07 3.780 18.00  0  0    3    3   188
## Cadillac Fleetwood  472.0 205 2.93 5.250 17.98  0  0    3    4   213
## Lincoln Continental 460.0 215 3.00 5.424 17.82  0  0    3    4   223
## Chrysler Imperial   440.0 230 3.23 5.345 17.42  0  0    3    4   238
## Fiat 128             78.7  66 4.08 2.200 19.47  1  1    4    1    70
## Honda Civic          75.7  52 4.93 1.615 18.52  1  1    4    2    56
## Toyota Corolla       71.1  65 4.22 1.835 19.90  1  1    4    1    69
## Toyota Corona       120.1  97 3.70 2.465 20.01  1  0    3    1   101
## Dodge Challenger    318.0 150 2.76 3.520 16.87  0  0    3    2   158
## AMC Javelin         304.0 150 3.15 3.435 17.30  0  0    3    2   158
## Camaro Z28          350.0 245 3.73 3.840 15.41  0  0    3    4   253
## Pontiac Firebird    400.0 175 3.08 3.845 17.05  0  0    3    2   183
## Fiat X1-9            79.0  66 4.08 1.935 18.90  1  1    4    1    70
## Porsche 914-2       120.3  91 4.43 2.140 16.70  0  1    5    2    95
## Lotus Europa         95.1 113 3.77 1.513 16.90  1  1    5    2   117
## Ford Pantera L      351.0 264 4.22 3.170 14.50  0  1    5    4   272
## Ferrari Dino        145.0 175 3.62 2.770 15.50  0  1    5    6   181
## Maserati Bora       301.0 335 3.54 3.570 14.60  0  1    5    8   343
## Volvo 142E          121.0 109 4.11 2.780 18.60  1  1    4    2   113
select(mtcars, contains("s"))
##                      disp  qsec vs
## Mazda RX4           160.0 16.46  0
## Mazda RX4 Wag       160.0 17.02  0
## Datsun 710          108.0 18.61  1
## Hornet 4 Drive      258.0 19.44  1
## Hornet Sportabout   360.0 17.02  0
## Valiant             225.0 20.22  1
## Duster 360          360.0 15.84  0
## Merc 240D           146.7 20.00  1
## Merc 230            140.8 22.90  1
## Merc 280            167.6 18.30  1
## Merc 280C           167.6 18.90  1
## Merc 450SE          275.8 17.40  0
## Merc 450SL          275.8 17.60  0
## Merc 450SLC         275.8 18.00  0
## Cadillac Fleetwood  472.0 17.98  0
## Lincoln Continental 460.0 17.82  0
## Chrysler Imperial   440.0 17.42  0
## Fiat 128             78.7 19.47  1
## Honda Civic          75.7 18.52  1
## Toyota Corolla       71.1 19.90  1
## Toyota Corona       120.1 20.01  1
## Dodge Challenger    318.0 16.87  0
## AMC Javelin         304.0 17.30  0
## Camaro Z28          350.0 15.41  0
## Pontiac Firebird    400.0 17.05  0
## Fiat X1-9            79.0 18.90  1
## Porsche 914-2       120.3 16.70  0
## Lotus Europa         95.1 16.90  1
## Ford Pantera L      351.0 14.50  0
## Ferrari Dino        145.0 15.50  0
## Maserati Bora       301.0 14.60  0
## Volvo 142E          121.0 18.60  1
select(mtcars, matches("vs"))
##                     vs
## Mazda RX4            0
## Mazda RX4 Wag        0
## Datsun 710           1
## Hornet 4 Drive       1
## Hornet Sportabout    0
## Valiant              1
## Duster 360           0
## Merc 240D            1
## Merc 230             1
## Merc 280             1
## Merc 280C            1
## Merc 450SE           0
## Merc 450SL           0
## Merc 450SLC          0
## Cadillac Fleetwood   0
## Lincoln Continental  0
## Chrysler Imperial    0
## Fiat 128             1
## Honda Civic          1
## Toyota Corolla       1
## Toyota Corona        1
## Dodge Challenger     0
## AMC Javelin          0
## Camaro Z28           0
## Pontiac Firebird     0
## Fiat X1-9            1
## Porsche 914-2        0
## Lotus Europa         1
## Ford Pantera L       0
## Ferrari Dino         0
## Maserati Bora        0
## Volvo 142E           1
select(mtcars, starts_with("c"))
##                     cyl carb
## Mazda RX4             6    4
## Mazda RX4 Wag         6    4
## Datsun 710            4    1
## Hornet 4 Drive        6    1
## Hornet Sportabout     8    2
## Valiant               6    1
## Duster 360            8    4
## Merc 240D             4    2
## Merc 230              4    2
## Merc 280              6    4
## Merc 280C             6    4
## Merc 450SE            8    3
## Merc 450SL            8    3
## Merc 450SLC           8    3
## Cadillac Fleetwood    8    4
## Lincoln Continental   8    4
## Chrysler Imperial     8    4
## Fiat 128              4    1
## Honda Civic           4    2
## Toyota Corolla        4    1
## Toyota Corona         4    1
## Dodge Challenger      8    2
## AMC Javelin           8    2
## Camaro Z28            8    4
## Pontiac Firebird      8    2
## Fiat X1-9             4    1
## Porsche 914-2         4    2
## Lotus Europa          4    2
## Ford Pantera L        8    4
## Ferrari Dino          6    6
## Maserati Bora         8    8
## Volvo 142E            4    2
select(mtcars, ends_with("p"))
##                      disp  hp
## Mazda RX4           160.0 110
## Mazda RX4 Wag       160.0 110
## Datsun 710          108.0  93
## Hornet 4 Drive      258.0 110
## Hornet Sportabout   360.0 175
## Valiant             225.0 105
## Duster 360          360.0 245
## Merc 240D           146.7  62
## Merc 230            140.8  95
## Merc 280            167.6 123
## Merc 280C           167.6 123
## Merc 450SE          275.8 180
## Merc 450SL          275.8 180
## Merc 450SLC         275.8 180
## Cadillac Fleetwood  472.0 205
## Lincoln Continental 460.0 215
## Chrysler Imperial   440.0 230
## Fiat 128             78.7  66
## Honda Civic          75.7  52
## Toyota Corolla       71.1  65
## Toyota Corona       120.1  97
## Dodge Challenger    318.0 150
## AMC Javelin         304.0 150
## Camaro Z28          350.0 245
## Pontiac Firebird    400.0 175
## Fiat X1-9            79.0  66
## Porsche 914-2       120.3  91
## Lotus Europa         95.1 113
## Ford Pantera L      351.0 264
## Ferrari Dino        145.0 175
## Maserati Bora       301.0 335
## Volvo 142E          121.0 109
# num_range example
newCol = mtcars
names(newCol) = c('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11')
select(newCol, num_range("x", 2:5))
##                     x2    x3  x4   x5
## Mazda RX4            6 160.0 110 3.90
## Mazda RX4 Wag        6 160.0 110 3.90
## Datsun 710           4 108.0  93 3.85
## Hornet 4 Drive       6 258.0 110 3.08
## Hornet Sportabout    8 360.0 175 3.15
## Valiant              6 225.0 105 2.76
## Duster 360           8 360.0 245 3.21
## Merc 240D            4 146.7  62 3.69
## Merc 230             4 140.8  95 3.92
## Merc 280             6 167.6 123 3.92
## Merc 280C            6 167.6 123 3.92
## Merc 450SE           8 275.8 180 3.07
## Merc 450SL           8 275.8 180 3.07
## Merc 450SLC          8 275.8 180 3.07
## Cadillac Fleetwood   8 472.0 205 2.93
## Lincoln Continental  8 460.0 215 3.00
## Chrysler Imperial    8 440.0 230 3.23
## Fiat 128             4  78.7  66 4.08
## Honda Civic          4  75.7  52 4.93
## Toyota Corolla       4  71.1  65 4.22
## Toyota Corona        4 120.1  97 3.70
## Dodge Challenger     8 318.0 150 2.76
## AMC Javelin          8 304.0 150 3.15
## Camaro Z28           8 350.0 245 3.73
## Pontiac Firebird     8 400.0 175 3.08
## Fiat X1-9            4  79.0  66 4.08
## Porsche 914-2        4 120.3  91 4.43
## Lotus Europa         4  95.1 113 3.77
## Ford Pantera L       8 351.0 264 4.22
## Ferrari Dino         6 145.0 175 3.62
## Maserati Bora        8 301.0 335 3.54
## Volvo 142E           4 121.0 109 4.11
# Mutate example
mtcars <- mutate(mtcars, 
                 hpCyl = hp + cyl,
                 hpdetrend = hp - mean(hp, na.rm=TRUE)) 
head(select(mtcars, hp, cyl, hpCyl, hpdetrend))
##                    hp cyl hpCyl hpdetrend
## Mazda RX4         110   6   116  -36.6875
## Mazda RX4 Wag     110   6   116  -36.6875
## Datsun 710         93   4    97  -53.6875
## Hornet 4 Drive    110   6   116  -36.6875
## Hornet Sportabout 175   8   183   28.3125
## Valiant           105   6   111  -41.6875

3. dplyr Verbs – Filtering, Arranging, Summarizing

# Filter and arrange
filter(mtcars, cyl<8)
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb hpCyl
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4   116
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4   116
## Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1    97
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1   116
## Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1   111
## Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2    66
## Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2    99
## Merc 280       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4   129
## Merc 280C      17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4   129
## Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1    70
## Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2    56
## Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1    69
## Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1   101
## Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1    70
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2    95
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2   117
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6   181
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2   113
##                hpdetrend
## Mazda RX4       -36.6875
## Mazda RX4 Wag   -36.6875
## Datsun 710      -53.6875
## Hornet 4 Drive  -36.6875
## Valiant         -41.6875
## Merc 240D       -84.6875
## Merc 230        -51.6875
## Merc 280        -23.6875
## Merc 280C       -23.6875
## Fiat 128        -80.6875
## Honda Civic     -94.6875
## Toyota Corolla  -81.6875
## Toyota Corona   -49.6875
## Fiat X1-9       -80.6875
## Porsche 914-2   -55.6875
## Lotus Europa    -33.6875
## Ferrari Dino     28.3125
## Volvo 142E      -37.6875
filter(mtcars, cyl<8 & hp>500)
##  [1] mpg       cyl       disp      hp        drat      wt        qsec     
##  [8] vs        am        gear      carb      hpCyl     hpdetrend
## <0 rows> (or 0-length row.names)
arrange(mtcars, desc(cyl), hp)
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb hpCyl
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2   158
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2   158
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2   183
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2   183
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3   188
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3   188
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3   188
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4   213
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4   223
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4   238
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4   253
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4   253
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4   272
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8   343
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1   111
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4   116
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4   116
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1   116
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4   129
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4   129
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6   181
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2    56
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2    66
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1    69
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1    70
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1    70
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2    95
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1    97
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2    99
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1   101
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2   113
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2   117
##                     hpdetrend
## Dodge Challenger       3.3125
## AMC Javelin            3.3125
## Hornet Sportabout     28.3125
## Pontiac Firebird      28.3125
## Merc 450SE            33.3125
## Merc 450SL            33.3125
## Merc 450SLC           33.3125
## Cadillac Fleetwood    58.3125
## Lincoln Continental   68.3125
## Chrysler Imperial     83.3125
## Duster 360            98.3125
## Camaro Z28            98.3125
## Ford Pantera L       117.3125
## Maserati Bora        188.3125
## Valiant              -41.6875
## Mazda RX4            -36.6875
## Mazda RX4 Wag        -36.6875
## Hornet 4 Drive       -36.6875
## Merc 280             -23.6875
## Merc 280C            -23.6875
## Ferrari Dino          28.3125
## Honda Civic          -94.6875
## Merc 240D            -84.6875
## Toyota Corolla       -81.6875
## Fiat 128             -80.6875
## Fiat X1-9            -80.6875
## Porsche 914-2        -55.6875
## Datsun 710           -53.6875
## Merc 230             -51.6875
## Toyota Corona        -49.6875
## Volvo 142E           -37.6875
## Lotus Europa         -33.6875
# Summarize
disp_statistics <- summarize(mtcars, mean(disp),median(disp))
disp_statistics
##   mean(disp) median(disp)
## 1   230.7219        196.3
# Group by and summarize
cylinders <- group_by(mtcars, cyl)
summarize(cylinders, avg_disp = mean(disp), max_hp = max(hp))
## # A tibble: 3 × 3
##     cyl avg_disp max_hp
##   <dbl>    <dbl>  <dbl>
## 1     4     105.    113
## 2     6     183.    175
## 3     8     353.    335
# Piping example
group_by(mtcars, cyl) %>%  
  summarise(avg_disp = mean(disp), max_hp = max(hp)) 
## # A tibble: 3 × 3
##     cyl avg_disp max_hp
##   <dbl>    <dbl>  <dbl>
## 1     4     105.    113
## 2     6     183.    175
## 3     8     353.    335
group_by(mtcars, cyl) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count)) 
## # A tibble: 3 × 2
##     cyl count
##   <dbl> <int>
## 1     8    14
## 2     4    11
## 3     6     7

4. Joins and Metadata with Flights Dataset

# Joins with airlines and airports
flights_joined <- flights %>% 
  inner_join(airlines, by = "carrier")

flights_with_airport_names <- flights %>% 
  inner_join(airports, by = c("dest" = "faa"))

named_dests <- flights %>%
  group_by(dest) %>%
  summarize(num_flights = n()) %>%
  arrange(desc(num_flights)) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  rename(airport_name = name)
named_dests
## # A tibble: 101 × 9
##    dest  num_flights airport_name             lat    lon   alt    tz dst   tzone
##    <chr>       <int> <chr>                  <dbl>  <dbl> <dbl> <dbl> <chr> <chr>
##  1 ORD         17283 Chicago Ohare Intl      42.0  -87.9   668    -6 A     Amer…
##  2 ATL         17215 Hartsfield Jackson At…  33.6  -84.4  1026    -5 A     Amer…
##  3 LAX         16174 Los Angeles Intl        33.9 -118.    126    -8 A     Amer…
##  4 BOS         15508 General Edward Lawren…  42.4  -71.0    19    -5 A     Amer…
##  5 MCO         14082 Orlando Intl            28.4  -81.3    96    -5 A     Amer…
##  6 CLT         14064 Charlotte Douglas Intl  35.2  -80.9   748    -5 A     Amer…
##  7 SFO         13331 San Francisco Intl      37.6 -122.     13    -8 A     Amer…
##  8 FLL         12055 Fort Lauderdale Holly…  26.1  -80.2     9    -5 A     Amer…
##  9 MIA         11728 Miami Intl              25.8  -80.3     8    -5 A     Amer…
## 10 DCA          9705 Ronald Reagan Washing…  38.9  -77.0    15    -5 A     Amer…
## # ℹ 91 more rows
flights_weather_joined <- flights %>%
  inner_join(weather, by = c("year", "month", "day", "hour", "origin"))

joined_flights <- flights %>% 
  inner_join(airlines, by = "carrier")

5. Tidy Data Examples

# Drinks dataset
drinks_smaller <- drinks %>% 
  filter(country %in% c("USA", "China", "Italy", "Saudi Arabia")) %>% 
  select(-total_litres_of_pure_alcohol) %>% 
  rename(beer = beer_servings, spirit = spirit_servings, wine = wine_servings)
drinks_smaller
## # A tibble: 4 × 4
##   country       beer spirit  wine
##   <chr>        <int>  <int> <int>
## 1 China           79    192     8
## 2 Italy           85     42   237
## 3 Saudi Arabia     0      5     0
## 4 USA            249    158    84
drinks_smaller_tidy <- drinks_smaller %>% 
  pivot_longer(names_to = "type", 
               values_to = "servings", 
               cols = -country)
drinks_smaller_tidy
## # A tibble: 12 × 3
##    country      type   servings
##    <chr>        <chr>     <int>
##  1 China        beer         79
##  2 China        spirit      192
##  3 China        wine          8
##  4 Italy        beer         85
##  5 Italy        spirit       42
##  6 Italy        wine        237
##  7 Saudi Arabia beer          0
##  8 Saudi Arabia spirit        5
##  9 Saudi Arabia wine          0
## 10 USA          beer        249
## 11 USA          spirit      158
## 12 USA          wine         84
drinks_smaller %>% 
  pivot_longer(names_to = "type", 
               values_to = "servings", 
               cols = c(beer, spirit, wine))
## # A tibble: 12 × 3
##    country      type   servings
##    <chr>        <chr>     <int>
##  1 China        beer         79
##  2 China        spirit      192
##  3 China        wine          8
##  4 Italy        beer         85
##  5 Italy        spirit       42
##  6 Italy        wine        237
##  7 Saudi Arabia beer          0
##  8 Saudi Arabia spirit        5
##  9 Saudi Arabia wine          0
## 10 USA          beer        249
## 11 USA          spirit      158
## 12 USA          wine         84
ggplot(drinks_smaller_tidy, aes(x = country, y = servings, fill = type)) + 
  geom_col(position = "dodge")

drinks_smaller_tidy %>% 
  pivot_wider(
    names_from = type,
    values_from = servings
)
## # A tibble: 4 × 4
##   country       beer spirit  wine
##   <chr>        <int>  <int> <int>
## 1 China           79    192     8
## 2 Italy           85     42   237
## 3 Saudi Arabia     0      5     0
## 4 USA            249    158    84

6. Airline Safety Example

airline_safety_smaller <- airline_safety %>% 
  select(airline, starts_with("fatalities"))
airline_safety_smaller
## # A tibble: 56 × 3
##    airline               fatalities_85_99 fatalities_00_14
##    <chr>                            <int>            <int>
##  1 Aer Lingus                           0                0
##  2 Aeroflot                           128               88
##  3 Aerolineas Argentinas                0                0
##  4 Aeromexico                          64                0
##  5 Air Canada                           0                0
##  6 Air France                          79              337
##  7 Air India                          329              158
##  8 Air New Zealand                      0                7
##  9 Alaska Airlines                      0               88
## 10 Alitalia                            50                0
## # ℹ 46 more rows
airline_safety_smaller_tidy <- airline_safety_smaller %>%
  pivot_longer(names_to = "fatilities_years",
               values_to = "count",
    cols = -airline
  )
airline_safety_smaller_tidy
## # A tibble: 112 × 3
##    airline               fatilities_years count
##    <chr>                 <chr>            <int>
##  1 Aer Lingus            fatalities_85_99     0
##  2 Aer Lingus            fatalities_00_14     0
##  3 Aeroflot              fatalities_85_99   128
##  4 Aeroflot              fatalities_00_14    88
##  5 Aerolineas Argentinas fatalities_85_99     0
##  6 Aerolineas Argentinas fatalities_00_14     0
##  7 Aeromexico            fatalities_85_99    64
##  8 Aeromexico            fatalities_00_14     0
##  9 Air Canada            fatalities_85_99     0
## 10 Air Canada            fatalities_00_14     0
## # ℹ 102 more rows

Q1

glimpse(nycflights13::flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
summary(nycflights13::flights)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                     
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00.00  
##  Median :29.00   Median :2013-07-03 10:00:00.00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54.64  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00.00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00  
## 

Q2

sortf <- dplyr::arrange(flights, dplyr::desc(dep_delay))
dplyr::select(sortf, carrier, flight, tailnum, dplyr::everything()) |> head()
## # A tibble: 6 × 19
##   carrier flight tailnum  year month   day dep_time sched_dep_time dep_delay
##   <chr>    <int> <chr>   <int> <int> <int>    <int>          <int>     <dbl>
## 1 HA          51 N384HA   2013     1     9      641            900      1301
## 2 MQ        3535 N504MQ   2013     6    15     1432           1935      1137
## 3 MQ        3695 N517MQ   2013     1    10     1121           1635      1126
## 4 AA         177 N338AA   2013     9    20     1139           1845      1014
## 5 MQ        3075 N665MQ   2013     7    22      845           1600      1005
## 6 DL        2391 N959DL   2013     4    10     1100           1900       960
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
maxdep    <- max(flights$dep_delay, na.rm = TRUE)
maxdep_id <- which(flights$dep_delay == maxdep)
flights[maxdep_id, 10:12]  # shows carrier/flight/tailnum columns region
## # A tibble: 1 × 3
##   carrier flight tailnum
##   <chr>    <int> <chr>  
## 1 HA          51 N384HA

Q3

flights |>
  dplyr::group_by(year, month, day) |>
  dplyr::summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ℹ 355 more rows
not_cancelled <- flights |>
  dplyr::filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled <- flights |>
  dplyr::filter(!is.na(dep_delay))

not_cancelled |>
  dplyr::group_by(year, month, day) |>
  dplyr::summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ℹ 355 more rows

Q4

not_cancelled |>
  dplyr::group_by(tailnum) |>
  dplyr::summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE), .groups = "drop") |>
  dplyr::arrange(avg_arr_delay) |>
  head(10)
## # A tibble: 10 × 2
##    tailnum avg_arr_delay
##    <chr>           <dbl>
##  1 N560AS          -53  
##  2 N315AS          -51  
##  3 N517AS          -40.5
##  4 N592AS          -35.5
##  5 N7AYAA          -35  
##  6 N585AS          -34.5
##  7 N915DN          -34  
##  8 N512AS          -31.5
##  9 N594AS          -31.3
## 10 N564AS          -31

Q5

not_cancelled |>
  dplyr::group_by(year, month, day) |>
  dplyr::summarise(first = min(dep_time, na.rm = TRUE),
                   last  = max(dep_time, na.rm = TRUE),
                   .groups = "drop") |>
  dplyr::arrange(dplyr::desc(last)) |>
  head(10)
## # A tibble: 10 × 5
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     2     7    27  2400
##  2  2013     2    11     1  2400
##  3  2013     3    15    11  2400
##  4  2013     3    22    37  2400
##  5  2013     3    25    13  2400
##  6  2013     4     2     9  2400
##  7  2013     4     4    14  2400
##  8  2013     4    20     7  2400
##  9  2013     5    21   110  2400
## 10  2013     6    17     2  2400

Q6

not_cancelled |>
  dplyr::mutate(long_delay = dep_delay > 60) |>
  dplyr::group_by(month) |>
  dplyr::summarise(proportion = mean(long_delay, na.rm = TRUE),
                   .groups = "drop") |>
  dplyr::arrange(dplyr::desc(proportion))
## # A tibble: 12 × 2
##    month proportion
##    <int>      <dbl>
##  1     7     0.134 
##  2     6     0.128 
##  3    12     0.0942
##  4     4     0.0916
##  5     3     0.0837
##  6     5     0.0818
##  7     8     0.0796
##  8     2     0.0698
##  9     1     0.0688
## 10     9     0.0490
## 11    10     0.0469
## 12    11     0.0402

Q7

flights |>
  dplyr::group_by(dest) |>
  dplyr::summarise(num_carriers = dplyr::n_distinct(carrier), .groups = "drop") |>
  dplyr::arrange(dplyr::desc(num_carriers)) |>
  head(10)
## # A tibble: 10 × 2
##    dest  num_carriers
##    <chr>        <int>
##  1 ATL              7
##  2 BOS              7
##  3 CLT              7
##  4 ORD              7
##  5 TPA              7
##  6 AUS              6
##  7 DCA              6
##  8 DTW              6
##  9 IAD              6
## 10 MSP              6

Q8

# 1) group
by_dest <- dplyr::group_by(flights, dest)

# 2) summarise
delay <- dplyr::summarise(by_dest,
                          count = dplyr::n(),
                          dist  = mean(distance, na.rm = TRUE),
                          delay = mean(arr_delay,  na.rm = TRUE))

# 3) filter
delay <- dplyr::filter(delay, count > 20, dest != "HNL")

# 4) plot
ggplot2::ggplot(delay, ggplot2::aes(x = dist, y = delay)) +
  ggplot2::geom_point(ggplot2::aes(size = count), alpha = 1/3) +
  ggplot2::geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Q9

delays <- flights |>
  dplyr::group_by(dest) |>
  dplyr::summarise(count = dplyr::n(),
                   dist  = mean(distance, na.rm = TRUE),
                   delay = mean(arr_delay,  na.rm = TRUE),
                   .groups = "drop") |>
  dplyr::filter(count > 20, dest != "HNL")

head(delays)
## # A tibble: 6 × 4
##   dest  count  dist delay
##   <chr> <int> <dbl> <dbl>
## 1 ABQ     254 1826   4.38
## 2 ACK     265  199   4.85
## 3 ALB     439  143  14.4 
## 4 ATL   17215  757. 11.3 
## 5 AUS    2439 1514.  6.02
## 6 AVL     275  584.  8.00