pacman::p_load(dplyr, nycflights13, tidyverse, fivethirtyeight, ggplot2)
data(mtcars)
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Select columns
mtcars[, c("mpg", "cyl")]
## mpg cyl
## Mazda RX4 21.0 6
## Mazda RX4 Wag 21.0 6
## Datsun 710 22.8 4
## Hornet 4 Drive 21.4 6
## Hornet Sportabout 18.7 8
## Valiant 18.1 6
## Duster 360 14.3 8
## Merc 240D 24.4 4
## Merc 230 22.8 4
## Merc 280 19.2 6
## Merc 280C 17.8 6
## Merc 450SE 16.4 8
## Merc 450SL 17.3 8
## Merc 450SLC 15.2 8
## Cadillac Fleetwood 10.4 8
## Lincoln Continental 10.4 8
## Chrysler Imperial 14.7 8
## Fiat 128 32.4 4
## Honda Civic 30.4 4
## Toyota Corolla 33.9 4
## Toyota Corona 21.5 4
## Dodge Challenger 15.5 8
## AMC Javelin 15.2 8
## Camaro Z28 13.3 8
## Pontiac Firebird 19.2 8
## Fiat X1-9 27.3 4
## Porsche 914-2 26.0 4
## Lotus Europa 30.4 4
## Ford Pantera L 15.8 8
## Ferrari Dino 19.7 6
## Maserati Bora 15.0 8
## Volvo 142E 21.4 4
subset(mtcars, select=c("mpg", "cyl"))
## mpg cyl
## Mazda RX4 21.0 6
## Mazda RX4 Wag 21.0 6
## Datsun 710 22.8 4
## Hornet 4 Drive 21.4 6
## Hornet Sportabout 18.7 8
## Valiant 18.1 6
## Duster 360 14.3 8
## Merc 240D 24.4 4
## Merc 230 22.8 4
## Merc 280 19.2 6
## Merc 280C 17.8 6
## Merc 450SE 16.4 8
## Merc 450SL 17.3 8
## Merc 450SLC 15.2 8
## Cadillac Fleetwood 10.4 8
## Lincoln Continental 10.4 8
## Chrysler Imperial 14.7 8
## Fiat 128 32.4 4
## Honda Civic 30.4 4
## Toyota Corolla 33.9 4
## Toyota Corona 21.5 4
## Dodge Challenger 15.5 8
## AMC Javelin 15.2 8
## Camaro Z28 13.3 8
## Pontiac Firebird 19.2 8
## Fiat X1-9 27.3 4
## Porsche 914-2 26.0 4
## Lotus Europa 30.4 4
## Ford Pantera L 15.8 8
## Ferrari Dino 19.7 6
## Maserati Bora 15.0 8
## Volvo 142E 21.4 4
# Select columns containing "s"
mtcars[,grep("s", colnames(mtcars))]
## disp qsec vs
## Mazda RX4 160.0 16.46 0
## Mazda RX4 Wag 160.0 17.02 0
## Datsun 710 108.0 18.61 1
## Hornet 4 Drive 258.0 19.44 1
## Hornet Sportabout 360.0 17.02 0
## Valiant 225.0 20.22 1
## Duster 360 360.0 15.84 0
## Merc 240D 146.7 20.00 1
## Merc 230 140.8 22.90 1
## Merc 280 167.6 18.30 1
## Merc 280C 167.6 18.90 1
## Merc 450SE 275.8 17.40 0
## Merc 450SL 275.8 17.60 0
## Merc 450SLC 275.8 18.00 0
## Cadillac Fleetwood 472.0 17.98 0
## Lincoln Continental 460.0 17.82 0
## Chrysler Imperial 440.0 17.42 0
## Fiat 128 78.7 19.47 1
## Honda Civic 75.7 18.52 1
## Toyota Corolla 71.1 19.90 1
## Toyota Corona 120.1 20.01 1
## Dodge Challenger 318.0 16.87 0
## AMC Javelin 304.0 17.30 0
## Camaro Z28 350.0 15.41 0
## Pontiac Firebird 400.0 17.05 0
## Fiat X1-9 79.0 18.90 1
## Porsche 914-2 120.3 16.70 0
## Lotus Europa 95.1 16.90 1
## Ford Pantera L 351.0 14.50 0
## Ferrari Dino 145.0 15.50 0
## Maserati Bora 301.0 14.60 0
## Volvo 142E 121.0 18.60 1
# Add a new column
mtcars$hpCyl <- mtcars$hp + mtcars$cyl
head(mtcars$hpCyl)
## [1] 116 116 97 116 183 111
# Filter rows
mtcars[(mtcars$cyl<=6 & mtcars$hp>500), ]
## [1] mpg cyl disp hp drat wt qsec vs am gear carb hpCyl
## <0 rows> (or 0-length row.names)
subset(mtcars, (cyl<=6 & mtcars$hp>500))
## [1] mpg cyl disp hp drat wt qsec vs am gear carb hpCyl
## <0 rows> (or 0-length row.names)
# Summarize for cyl = 4
cyl4 <- mtcars[which(mtcars$cyl==4),]
mean(cyl4$disp)
## [1] 105.1364
max(cyl4$hp)
## [1] 113
# Column selection
select(mtcars, cyl, mpg)
## cyl mpg
## Mazda RX4 6 21.0
## Mazda RX4 Wag 6 21.0
## Datsun 710 4 22.8
## Hornet 4 Drive 6 21.4
## Hornet Sportabout 8 18.7
## Valiant 6 18.1
## Duster 360 8 14.3
## Merc 240D 4 24.4
## Merc 230 4 22.8
## Merc 280 6 19.2
## Merc 280C 6 17.8
## Merc 450SE 8 16.4
## Merc 450SL 8 17.3
## Merc 450SLC 8 15.2
## Cadillac Fleetwood 8 10.4
## Lincoln Continental 8 10.4
## Chrysler Imperial 8 14.7
## Fiat 128 4 32.4
## Honda Civic 4 30.4
## Toyota Corolla 4 33.9
## Toyota Corona 4 21.5
## Dodge Challenger 8 15.5
## AMC Javelin 8 15.2
## Camaro Z28 8 13.3
## Pontiac Firebird 8 19.2
## Fiat X1-9 4 27.3
## Porsche 914-2 4 26.0
## Lotus Europa 4 30.4
## Ford Pantera L 8 15.8
## Ferrari Dino 6 19.7
## Maserati Bora 8 15.0
## Volvo 142E 4 21.4
select(mtcars, cyl:mpg)
## cyl mpg
## Mazda RX4 6 21.0
## Mazda RX4 Wag 6 21.0
## Datsun 710 4 22.8
## Hornet 4 Drive 6 21.4
## Hornet Sportabout 8 18.7
## Valiant 6 18.1
## Duster 360 8 14.3
## Merc 240D 4 24.4
## Merc 230 4 22.8
## Merc 280 6 19.2
## Merc 280C 6 17.8
## Merc 450SE 8 16.4
## Merc 450SL 8 17.3
## Merc 450SLC 8 15.2
## Cadillac Fleetwood 8 10.4
## Lincoln Continental 8 10.4
## Chrysler Imperial 8 14.7
## Fiat 128 4 32.4
## Honda Civic 4 30.4
## Toyota Corolla 4 33.9
## Toyota Corona 4 21.5
## Dodge Challenger 8 15.5
## AMC Javelin 8 15.2
## Camaro Z28 8 13.3
## Pontiac Firebird 8 19.2
## Fiat X1-9 4 27.3
## Porsche 914-2 4 26.0
## Lotus Europa 4 30.4
## Ford Pantera L 8 15.8
## Ferrari Dino 6 19.7
## Maserati Bora 8 15.0
## Volvo 142E 4 21.4
select(mtcars, -(cyl:mpg))
## disp hp drat wt qsec vs am gear carb hpCyl
## Mazda RX4 160.0 110 3.90 2.620 16.46 0 1 4 4 116
## Mazda RX4 Wag 160.0 110 3.90 2.875 17.02 0 1 4 4 116
## Datsun 710 108.0 93 3.85 2.320 18.61 1 1 4 1 97
## Hornet 4 Drive 258.0 110 3.08 3.215 19.44 1 0 3 1 116
## Hornet Sportabout 360.0 175 3.15 3.440 17.02 0 0 3 2 183
## Valiant 225.0 105 2.76 3.460 20.22 1 0 3 1 111
## Duster 360 360.0 245 3.21 3.570 15.84 0 0 3 4 253
## Merc 240D 146.7 62 3.69 3.190 20.00 1 0 4 2 66
## Merc 230 140.8 95 3.92 3.150 22.90 1 0 4 2 99
## Merc 280 167.6 123 3.92 3.440 18.30 1 0 4 4 129
## Merc 280C 167.6 123 3.92 3.440 18.90 1 0 4 4 129
## Merc 450SE 275.8 180 3.07 4.070 17.40 0 0 3 3 188
## Merc 450SL 275.8 180 3.07 3.730 17.60 0 0 3 3 188
## Merc 450SLC 275.8 180 3.07 3.780 18.00 0 0 3 3 188
## Cadillac Fleetwood 472.0 205 2.93 5.250 17.98 0 0 3 4 213
## Lincoln Continental 460.0 215 3.00 5.424 17.82 0 0 3 4 223
## Chrysler Imperial 440.0 230 3.23 5.345 17.42 0 0 3 4 238
## Fiat 128 78.7 66 4.08 2.200 19.47 1 1 4 1 70
## Honda Civic 75.7 52 4.93 1.615 18.52 1 1 4 2 56
## Toyota Corolla 71.1 65 4.22 1.835 19.90 1 1 4 1 69
## Toyota Corona 120.1 97 3.70 2.465 20.01 1 0 3 1 101
## Dodge Challenger 318.0 150 2.76 3.520 16.87 0 0 3 2 158
## AMC Javelin 304.0 150 3.15 3.435 17.30 0 0 3 2 158
## Camaro Z28 350.0 245 3.73 3.840 15.41 0 0 3 4 253
## Pontiac Firebird 400.0 175 3.08 3.845 17.05 0 0 3 2 183
## Fiat X1-9 79.0 66 4.08 1.935 18.90 1 1 4 1 70
## Porsche 914-2 120.3 91 4.43 2.140 16.70 0 1 5 2 95
## Lotus Europa 95.1 113 3.77 1.513 16.90 1 1 5 2 117
## Ford Pantera L 351.0 264 4.22 3.170 14.50 0 1 5 4 272
## Ferrari Dino 145.0 175 3.62 2.770 15.50 0 1 5 6 181
## Maserati Bora 301.0 335 3.54 3.570 14.60 0 1 5 8 343
## Volvo 142E 121.0 109 4.11 2.780 18.60 1 1 4 2 113
select(mtcars, contains("s"))
## disp qsec vs
## Mazda RX4 160.0 16.46 0
## Mazda RX4 Wag 160.0 17.02 0
## Datsun 710 108.0 18.61 1
## Hornet 4 Drive 258.0 19.44 1
## Hornet Sportabout 360.0 17.02 0
## Valiant 225.0 20.22 1
## Duster 360 360.0 15.84 0
## Merc 240D 146.7 20.00 1
## Merc 230 140.8 22.90 1
## Merc 280 167.6 18.30 1
## Merc 280C 167.6 18.90 1
## Merc 450SE 275.8 17.40 0
## Merc 450SL 275.8 17.60 0
## Merc 450SLC 275.8 18.00 0
## Cadillac Fleetwood 472.0 17.98 0
## Lincoln Continental 460.0 17.82 0
## Chrysler Imperial 440.0 17.42 0
## Fiat 128 78.7 19.47 1
## Honda Civic 75.7 18.52 1
## Toyota Corolla 71.1 19.90 1
## Toyota Corona 120.1 20.01 1
## Dodge Challenger 318.0 16.87 0
## AMC Javelin 304.0 17.30 0
## Camaro Z28 350.0 15.41 0
## Pontiac Firebird 400.0 17.05 0
## Fiat X1-9 79.0 18.90 1
## Porsche 914-2 120.3 16.70 0
## Lotus Europa 95.1 16.90 1
## Ford Pantera L 351.0 14.50 0
## Ferrari Dino 145.0 15.50 0
## Maserati Bora 301.0 14.60 0
## Volvo 142E 121.0 18.60 1
select(mtcars, matches("vs"))
## vs
## Mazda RX4 0
## Mazda RX4 Wag 0
## Datsun 710 1
## Hornet 4 Drive 1
## Hornet Sportabout 0
## Valiant 1
## Duster 360 0
## Merc 240D 1
## Merc 230 1
## Merc 280 1
## Merc 280C 1
## Merc 450SE 0
## Merc 450SL 0
## Merc 450SLC 0
## Cadillac Fleetwood 0
## Lincoln Continental 0
## Chrysler Imperial 0
## Fiat 128 1
## Honda Civic 1
## Toyota Corolla 1
## Toyota Corona 1
## Dodge Challenger 0
## AMC Javelin 0
## Camaro Z28 0
## Pontiac Firebird 0
## Fiat X1-9 1
## Porsche 914-2 0
## Lotus Europa 1
## Ford Pantera L 0
## Ferrari Dino 0
## Maserati Bora 0
## Volvo 142E 1
select(mtcars, starts_with("c"))
## cyl carb
## Mazda RX4 6 4
## Mazda RX4 Wag 6 4
## Datsun 710 4 1
## Hornet 4 Drive 6 1
## Hornet Sportabout 8 2
## Valiant 6 1
## Duster 360 8 4
## Merc 240D 4 2
## Merc 230 4 2
## Merc 280 6 4
## Merc 280C 6 4
## Merc 450SE 8 3
## Merc 450SL 8 3
## Merc 450SLC 8 3
## Cadillac Fleetwood 8 4
## Lincoln Continental 8 4
## Chrysler Imperial 8 4
## Fiat 128 4 1
## Honda Civic 4 2
## Toyota Corolla 4 1
## Toyota Corona 4 1
## Dodge Challenger 8 2
## AMC Javelin 8 2
## Camaro Z28 8 4
## Pontiac Firebird 8 2
## Fiat X1-9 4 1
## Porsche 914-2 4 2
## Lotus Europa 4 2
## Ford Pantera L 8 4
## Ferrari Dino 6 6
## Maserati Bora 8 8
## Volvo 142E 4 2
select(mtcars, ends_with("p"))
## disp hp
## Mazda RX4 160.0 110
## Mazda RX4 Wag 160.0 110
## Datsun 710 108.0 93
## Hornet 4 Drive 258.0 110
## Hornet Sportabout 360.0 175
## Valiant 225.0 105
## Duster 360 360.0 245
## Merc 240D 146.7 62
## Merc 230 140.8 95
## Merc 280 167.6 123
## Merc 280C 167.6 123
## Merc 450SE 275.8 180
## Merc 450SL 275.8 180
## Merc 450SLC 275.8 180
## Cadillac Fleetwood 472.0 205
## Lincoln Continental 460.0 215
## Chrysler Imperial 440.0 230
## Fiat 128 78.7 66
## Honda Civic 75.7 52
## Toyota Corolla 71.1 65
## Toyota Corona 120.1 97
## Dodge Challenger 318.0 150
## AMC Javelin 304.0 150
## Camaro Z28 350.0 245
## Pontiac Firebird 400.0 175
## Fiat X1-9 79.0 66
## Porsche 914-2 120.3 91
## Lotus Europa 95.1 113
## Ford Pantera L 351.0 264
## Ferrari Dino 145.0 175
## Maserati Bora 301.0 335
## Volvo 142E 121.0 109
# num_range example
newCol = mtcars
names(newCol) = c('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11')
select(newCol, num_range("x", 2:5))
## x2 x3 x4 x5
## Mazda RX4 6 160.0 110 3.90
## Mazda RX4 Wag 6 160.0 110 3.90
## Datsun 710 4 108.0 93 3.85
## Hornet 4 Drive 6 258.0 110 3.08
## Hornet Sportabout 8 360.0 175 3.15
## Valiant 6 225.0 105 2.76
## Duster 360 8 360.0 245 3.21
## Merc 240D 4 146.7 62 3.69
## Merc 230 4 140.8 95 3.92
## Merc 280 6 167.6 123 3.92
## Merc 280C 6 167.6 123 3.92
## Merc 450SE 8 275.8 180 3.07
## Merc 450SL 8 275.8 180 3.07
## Merc 450SLC 8 275.8 180 3.07
## Cadillac Fleetwood 8 472.0 205 2.93
## Lincoln Continental 8 460.0 215 3.00
## Chrysler Imperial 8 440.0 230 3.23
## Fiat 128 4 78.7 66 4.08
## Honda Civic 4 75.7 52 4.93
## Toyota Corolla 4 71.1 65 4.22
## Toyota Corona 4 120.1 97 3.70
## Dodge Challenger 8 318.0 150 2.76
## AMC Javelin 8 304.0 150 3.15
## Camaro Z28 8 350.0 245 3.73
## Pontiac Firebird 8 400.0 175 3.08
## Fiat X1-9 4 79.0 66 4.08
## Porsche 914-2 4 120.3 91 4.43
## Lotus Europa 4 95.1 113 3.77
## Ford Pantera L 8 351.0 264 4.22
## Ferrari Dino 6 145.0 175 3.62
## Maserati Bora 8 301.0 335 3.54
## Volvo 142E 4 121.0 109 4.11
# Mutate example
mtcars <- mutate(mtcars,
hpCyl = hp + cyl,
hpdetrend = hp - mean(hp, na.rm=TRUE))
head(select(mtcars, hp, cyl, hpCyl, hpdetrend))
## hp cyl hpCyl hpdetrend
## Mazda RX4 110 6 116 -36.6875
## Mazda RX4 Wag 110 6 116 -36.6875
## Datsun 710 93 4 97 -53.6875
## Hornet 4 Drive 110 6 116 -36.6875
## Hornet Sportabout 175 8 183 28.3125
## Valiant 105 6 111 -41.6875
# Filter and arrange
filter(mtcars, cyl<8)
## mpg cyl disp hp drat wt qsec vs am gear carb hpCyl
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 116
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 116
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 97
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 116
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 111
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 66
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 99
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 129
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 129
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 70
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 56
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 69
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 101
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 70
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 95
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 117
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 181
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 113
## hpdetrend
## Mazda RX4 -36.6875
## Mazda RX4 Wag -36.6875
## Datsun 710 -53.6875
## Hornet 4 Drive -36.6875
## Valiant -41.6875
## Merc 240D -84.6875
## Merc 230 -51.6875
## Merc 280 -23.6875
## Merc 280C -23.6875
## Fiat 128 -80.6875
## Honda Civic -94.6875
## Toyota Corolla -81.6875
## Toyota Corona -49.6875
## Fiat X1-9 -80.6875
## Porsche 914-2 -55.6875
## Lotus Europa -33.6875
## Ferrari Dino 28.3125
## Volvo 142E -37.6875
filter(mtcars, cyl<8 & hp>500)
## [1] mpg cyl disp hp drat wt qsec
## [8] vs am gear carb hpCyl hpdetrend
## <0 rows> (or 0-length row.names)
arrange(mtcars, desc(cyl), hp)
## mpg cyl disp hp drat wt qsec vs am gear carb hpCyl
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 158
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 158
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 183
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 183
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 188
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 188
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 188
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 213
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 223
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 238
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 253
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 253
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 272
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 343
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 111
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 116
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 116
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 116
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 129
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 129
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 181
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 56
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 66
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 69
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 70
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 70
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 95
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 97
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 99
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 101
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 113
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 117
## hpdetrend
## Dodge Challenger 3.3125
## AMC Javelin 3.3125
## Hornet Sportabout 28.3125
## Pontiac Firebird 28.3125
## Merc 450SE 33.3125
## Merc 450SL 33.3125
## Merc 450SLC 33.3125
## Cadillac Fleetwood 58.3125
## Lincoln Continental 68.3125
## Chrysler Imperial 83.3125
## Duster 360 98.3125
## Camaro Z28 98.3125
## Ford Pantera L 117.3125
## Maserati Bora 188.3125
## Valiant -41.6875
## Mazda RX4 -36.6875
## Mazda RX4 Wag -36.6875
## Hornet 4 Drive -36.6875
## Merc 280 -23.6875
## Merc 280C -23.6875
## Ferrari Dino 28.3125
## Honda Civic -94.6875
## Merc 240D -84.6875
## Toyota Corolla -81.6875
## Fiat 128 -80.6875
## Fiat X1-9 -80.6875
## Porsche 914-2 -55.6875
## Datsun 710 -53.6875
## Merc 230 -51.6875
## Toyota Corona -49.6875
## Volvo 142E -37.6875
## Lotus Europa -33.6875
# Summarize
disp_statistics <- summarize(mtcars, mean(disp),median(disp))
disp_statistics
## mean(disp) median(disp)
## 1 230.7219 196.3
# Group by and summarize
cylinders <- group_by(mtcars, cyl)
summarize(cylinders, avg_disp = mean(disp), max_hp = max(hp))
## # A tibble: 3 × 3
## cyl avg_disp max_hp
## <dbl> <dbl> <dbl>
## 1 4 105. 113
## 2 6 183. 175
## 3 8 353. 335
# Piping example
group_by(mtcars, cyl) %>%
summarise(avg_disp = mean(disp), max_hp = max(hp))
## # A tibble: 3 × 3
## cyl avg_disp max_hp
## <dbl> <dbl> <dbl>
## 1 4 105. 113
## 2 6 183. 175
## 3 8 353. 335
group_by(mtcars, cyl) %>%
summarise(count = n()) %>%
arrange(desc(count))
## # A tibble: 3 × 2
## cyl count
## <dbl> <int>
## 1 8 14
## 2 4 11
## 3 6 7
# Joins with airlines and airports
flights_joined <- flights %>%
inner_join(airlines, by = "carrier")
flights_with_airport_names <- flights %>%
inner_join(airports, by = c("dest" = "faa"))
named_dests <- flights %>%
group_by(dest) %>%
summarize(num_flights = n()) %>%
arrange(desc(num_flights)) %>%
inner_join(airports, by = c("dest" = "faa")) %>%
rename(airport_name = name)
named_dests
## # A tibble: 101 × 9
## dest num_flights airport_name lat lon alt tz dst tzone
## <chr> <int> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 ORD 17283 Chicago Ohare Intl 42.0 -87.9 668 -6 A Amer…
## 2 ATL 17215 Hartsfield Jackson At… 33.6 -84.4 1026 -5 A Amer…
## 3 LAX 16174 Los Angeles Intl 33.9 -118. 126 -8 A Amer…
## 4 BOS 15508 General Edward Lawren… 42.4 -71.0 19 -5 A Amer…
## 5 MCO 14082 Orlando Intl 28.4 -81.3 96 -5 A Amer…
## 6 CLT 14064 Charlotte Douglas Intl 35.2 -80.9 748 -5 A Amer…
## 7 SFO 13331 San Francisco Intl 37.6 -122. 13 -8 A Amer…
## 8 FLL 12055 Fort Lauderdale Holly… 26.1 -80.2 9 -5 A Amer…
## 9 MIA 11728 Miami Intl 25.8 -80.3 8 -5 A Amer…
## 10 DCA 9705 Ronald Reagan Washing… 38.9 -77.0 15 -5 A Amer…
## # ℹ 91 more rows
flights_weather_joined <- flights %>%
inner_join(weather, by = c("year", "month", "day", "hour", "origin"))
joined_flights <- flights %>%
inner_join(airlines, by = "carrier")
# Drinks dataset
drinks_smaller <- drinks %>%
filter(country %in% c("USA", "China", "Italy", "Saudi Arabia")) %>%
select(-total_litres_of_pure_alcohol) %>%
rename(beer = beer_servings, spirit = spirit_servings, wine = wine_servings)
drinks_smaller
## # A tibble: 4 × 4
## country beer spirit wine
## <chr> <int> <int> <int>
## 1 China 79 192 8
## 2 Italy 85 42 237
## 3 Saudi Arabia 0 5 0
## 4 USA 249 158 84
drinks_smaller_tidy <- drinks_smaller %>%
pivot_longer(names_to = "type",
values_to = "servings",
cols = -country)
drinks_smaller_tidy
## # A tibble: 12 × 3
## country type servings
## <chr> <chr> <int>
## 1 China beer 79
## 2 China spirit 192
## 3 China wine 8
## 4 Italy beer 85
## 5 Italy spirit 42
## 6 Italy wine 237
## 7 Saudi Arabia beer 0
## 8 Saudi Arabia spirit 5
## 9 Saudi Arabia wine 0
## 10 USA beer 249
## 11 USA spirit 158
## 12 USA wine 84
drinks_smaller %>%
pivot_longer(names_to = "type",
values_to = "servings",
cols = c(beer, spirit, wine))
## # A tibble: 12 × 3
## country type servings
## <chr> <chr> <int>
## 1 China beer 79
## 2 China spirit 192
## 3 China wine 8
## 4 Italy beer 85
## 5 Italy spirit 42
## 6 Italy wine 237
## 7 Saudi Arabia beer 0
## 8 Saudi Arabia spirit 5
## 9 Saudi Arabia wine 0
## 10 USA beer 249
## 11 USA spirit 158
## 12 USA wine 84
ggplot(drinks_smaller_tidy, aes(x = country, y = servings, fill = type)) +
geom_col(position = "dodge")
drinks_smaller_tidy %>%
pivot_wider(
names_from = type,
values_from = servings
)
## # A tibble: 4 × 4
## country beer spirit wine
## <chr> <int> <int> <int>
## 1 China 79 192 8
## 2 Italy 85 42 237
## 3 Saudi Arabia 0 5 0
## 4 USA 249 158 84
airline_safety_smaller <- airline_safety %>%
select(airline, starts_with("fatalities"))
airline_safety_smaller
## # A tibble: 56 × 3
## airline fatalities_85_99 fatalities_00_14
## <chr> <int> <int>
## 1 Aer Lingus 0 0
## 2 Aeroflot 128 88
## 3 Aerolineas Argentinas 0 0
## 4 Aeromexico 64 0
## 5 Air Canada 0 0
## 6 Air France 79 337
## 7 Air India 329 158
## 8 Air New Zealand 0 7
## 9 Alaska Airlines 0 88
## 10 Alitalia 50 0
## # ℹ 46 more rows
airline_safety_smaller_tidy <- airline_safety_smaller %>%
pivot_longer(names_to = "fatilities_years",
values_to = "count",
cols = -airline
)
airline_safety_smaller_tidy
## # A tibble: 112 × 3
## airline fatilities_years count
## <chr> <chr> <int>
## 1 Aer Lingus fatalities_85_99 0
## 2 Aer Lingus fatalities_00_14 0
## 3 Aeroflot fatalities_85_99 128
## 4 Aeroflot fatalities_00_14 88
## 5 Aerolineas Argentinas fatalities_85_99 0
## 6 Aerolineas Argentinas fatalities_00_14 0
## 7 Aeromexico fatalities_85_99 64
## 8 Aeromexico fatalities_00_14 0
## 9 Air Canada fatalities_85_99 0
## 10 Air Canada fatalities_00_14 0
## # ℹ 102 more rows
glimpse(nycflights13::flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
summary(nycflights13::flights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00.00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00.00
## Median :29.00 Median :2013-07-03 10:00:00.00
## Mean :26.23 Mean :2013-07-03 05:22:54.64
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00.00
## Max. :59.00 Max. :2013-12-31 23:00:00.00
##
sortf <- dplyr::arrange(flights, dplyr::desc(dep_delay))
dplyr::select(sortf, carrier, flight, tailnum, dplyr::everything()) |> head()
## # A tibble: 6 × 19
## carrier flight tailnum year month day dep_time sched_dep_time dep_delay
## <chr> <int> <chr> <int> <int> <int> <int> <int> <dbl>
## 1 HA 51 N384HA 2013 1 9 641 900 1301
## 2 MQ 3535 N504MQ 2013 6 15 1432 1935 1137
## 3 MQ 3695 N517MQ 2013 1 10 1121 1635 1126
## 4 AA 177 N338AA 2013 9 20 1139 1845 1014
## 5 MQ 3075 N665MQ 2013 7 22 845 1600 1005
## 6 DL 2391 N959DL 2013 4 10 1100 1900 960
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
maxdep <- max(flights$dep_delay, na.rm = TRUE)
maxdep_id <- which(flights$dep_delay == maxdep)
flights[maxdep_id, 10:12] # shows carrier/flight/tailnum columns region
## # A tibble: 1 × 3
## carrier flight tailnum
## <chr> <int> <chr>
## 1 HA 51 N384HA
flights |>
dplyr::group_by(year, month, day) |>
dplyr::summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
not_cancelled <- flights |>
dplyr::filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled <- flights |>
dplyr::filter(!is.na(dep_delay))
not_cancelled |>
dplyr::group_by(year, month, day) |>
dplyr::summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
not_cancelled |>
dplyr::group_by(tailnum) |>
dplyr::summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE), .groups = "drop") |>
dplyr::arrange(avg_arr_delay) |>
head(10)
## # A tibble: 10 × 2
## tailnum avg_arr_delay
## <chr> <dbl>
## 1 N560AS -53
## 2 N315AS -51
## 3 N517AS -40.5
## 4 N592AS -35.5
## 5 N7AYAA -35
## 6 N585AS -34.5
## 7 N915DN -34
## 8 N512AS -31.5
## 9 N594AS -31.3
## 10 N564AS -31
not_cancelled |>
dplyr::group_by(year, month, day) |>
dplyr::summarise(first = min(dep_time, na.rm = TRUE),
last = max(dep_time, na.rm = TRUE),
.groups = "drop") |>
dplyr::arrange(dplyr::desc(last)) |>
head(10)
## # A tibble: 10 × 5
## year month day first last
## <int> <int> <int> <int> <int>
## 1 2013 2 7 27 2400
## 2 2013 2 11 1 2400
## 3 2013 3 15 11 2400
## 4 2013 3 22 37 2400
## 5 2013 3 25 13 2400
## 6 2013 4 2 9 2400
## 7 2013 4 4 14 2400
## 8 2013 4 20 7 2400
## 9 2013 5 21 110 2400
## 10 2013 6 17 2 2400
not_cancelled |>
dplyr::mutate(long_delay = dep_delay > 60) |>
dplyr::group_by(month) |>
dplyr::summarise(proportion = mean(long_delay, na.rm = TRUE),
.groups = "drop") |>
dplyr::arrange(dplyr::desc(proportion))
## # A tibble: 12 × 2
## month proportion
## <int> <dbl>
## 1 7 0.134
## 2 6 0.128
## 3 12 0.0942
## 4 4 0.0916
## 5 3 0.0837
## 6 5 0.0818
## 7 8 0.0796
## 8 2 0.0698
## 9 1 0.0688
## 10 9 0.0490
## 11 10 0.0469
## 12 11 0.0402
flights |>
dplyr::group_by(dest) |>
dplyr::summarise(num_carriers = dplyr::n_distinct(carrier), .groups = "drop") |>
dplyr::arrange(dplyr::desc(num_carriers)) |>
head(10)
## # A tibble: 10 × 2
## dest num_carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
# 1) group
by_dest <- dplyr::group_by(flights, dest)
# 2) summarise
delay <- dplyr::summarise(by_dest,
count = dplyr::n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE))
# 3) filter
delay <- dplyr::filter(delay, count > 20, dest != "HNL")
# 4) plot
ggplot2::ggplot(delay, ggplot2::aes(x = dist, y = delay)) +
ggplot2::geom_point(ggplot2::aes(size = count), alpha = 1/3) +
ggplot2::geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Q9
delays <- flights |>
dplyr::group_by(dest) |>
dplyr::summarise(count = dplyr::n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE),
.groups = "drop") |>
dplyr::filter(count > 20, dest != "HNL")
head(delays)
## # A tibble: 6 × 4
## dest count dist delay
## <chr> <int> <dbl> <dbl>
## 1 ABQ 254 1826 4.38
## 2 ACK 265 199 4.85
## 3 ALB 439 143 14.4
## 4 ATL 17215 757. 11.3
## 5 AUS 2439 1514. 6.02
## 6 AVL 275 584. 8.00