We will select a few columns from mtcars datasets to perform select operation on the data frame.
library(dplyr,quietly = TRUE)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nycflights13,quietly = TRUE)
## Warning: package 'nycflights13' was built under R version 3.4.3
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
#### Selecting only the desired columns from a dataframe###
d <- select(mtcars,mpg,disp,drat,am)
head(d)
## mpg disp drat am
## Mazda RX4 21.0 160 3.90 1
## Mazda RX4 Wag 21.0 160 3.90 1
## Datsun 710 22.8 108 3.85 1
## Hornet 4 Drive 21.4 258 3.08 0
## Hornet Sportabout 18.7 360 3.15 0
## Valiant 18.1 225 2.76 0
##select a list of columns from wt to carb in my dataframe
v <- select(mtcars, cyl, wt:carb)
head(v)
## cyl wt qsec vs am gear carb
## Mazda RX4 6 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 6 2.875 17.02 0 1 4 4
## Datsun 710 4 2.320 18.61 1 1 4 1
## Hornet 4 Drive 6 3.215 19.44 1 0 3 1
## Hornet Sportabout 8 3.440 17.02 0 0 3 2
## Valiant 6 3.460 20.22 1 0 3 1
## Using Helpers with select helps for some adhoc match requirements
## starts_with
new <- select(iris,contains("Sepal"))
head(new)
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
## 6 5.4 3.9
## ends_with
new1 <- select(iris,ends_with("Width"))
head(new1)
## Sepal.Width Petal.Width
## 1 3.5 0.2
## 2 3.0 0.2
## 3 3.2 0.2
## 4 3.1 0.2
## 5 3.6 0.2
## 6 3.9 0.4
## drop variables with -
new2 <- select(iris, -starts_with("Petal"))
head(new2)
## Sepal.Length Sepal.Width Species
## 1 5.1 3.5 setosa
## 2 4.9 3.0 setosa
## 3 4.7 3.2 setosa
## 4 4.6 3.1 setosa
## 5 5.0 3.6 setosa
## 6 5.4 3.9 setosa
To filter a data frame we need to use a logical vector. For example, if we wanted to filter mtcars by cyl and gear
### To filter on 1 column only
d <- filter(mtcars, cyl == 8)
## Warning: package 'bindrcpp' was built under R version 3.4.2
head(d)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2
## 2 14.3 8 360.0 245 3.21 3.57 15.84 0 0 3 4
## 3 16.4 8 275.8 180 3.07 4.07 17.40 0 0 3 3
## 4 17.3 8 275.8 180 3.07 3.73 17.60 0 0 3 3
## 5 15.2 8 275.8 180 3.07 3.78 18.00 0 0 3 3
## 6 10.4 8 472.0 205 2.93 5.25 17.98 0 0 3 4
### To filter on more than one column with a condition
d <- filter(mtcars, hp > 3, gear == 3 | gear == 5)
head(d)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 2 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 3 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 4 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 5 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 6 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
### using in parameter for the same functionality
e <- filter(mtcars, disp %in% c(258, 318))
head(e)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 2 15.5 8 318 150 2.76 3.520 16.87 0 0 3 2
In dplyr ordering a data frame is very easy. We use the arrange() function.
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
### Arranging first by cyl and then by disp, for mtcars dataset
new4 <- arrange(mtcars, cyl, disp)
head(new4)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 2 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 3 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 4 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 5 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 6 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
### arranging in descending order
new5 <- arrange(mtcars, desc(disp))
head(new5)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
## 2 10.4 8 460 215 3.00 5.424 17.82 0 0 3 4
## 3 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
## 4 19.2 8 400 175 3.08 3.845 17.05 0 0 3 2
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 6 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4
using tidyverse package with nycflights3 to use mutate()
library(nycflights13)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v readr 1.1.1
## v tibble 1.3.4 v purrr 0.2.4
## v tidyr 0.7.2 v stringr 1.2.0
## v ggplot2 2.2.1 v forcats 0.2.0
## Warning: package 'ggplot2' was built under R version 3.4.3
## Warning: package 'tibble' was built under R version 3.4.2
## Warning: package 'tidyr' was built under R version 3.4.3
## Warning: package 'readr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.3
## Warning: package 'stringr' was built under R version 3.4.3
## Warning: package 'forcats' was built under R version 3.4.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## using select to pull desired data
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time
)
## make two new columns named as gain and speed in the same data set
flights_sml
## # A tibble: 336,776 x 7
## year month day dep_delay arr_delay distance air_time
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227
## 2 2013 1 1 4 20 1416 227
## 3 2013 1 1 2 33 1089 160
## 4 2013 1 1 -1 -18 1576 183
## 5 2013 1 1 -6 -25 762 116
## 6 2013 1 1 -4 12 719 150
## 7 2013 1 1 -5 19 1065 158
## 8 2013 1 1 -3 -14 229 53
## 9 2013 1 1 -3 -8 944 140
## 10 2013 1 1 -2 8 733 138
## # ... with 336,766 more rows
new6 <- mutate(flights_sml,
gain = arr_delay - dep_delay,
speed = distance / air_time * 60
)
head(new6)
## # A tibble: 6 x 9
## year month day dep_delay arr_delay distance air_time gain speed
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 370.0441
## 2 2013 1 1 4 20 1416 227 16 374.2731
## 3 2013 1 1 2 33 1089 160 31 408.3750
## 4 2013 1 1 -1 -18 1576 183 -17 516.7213
## 5 2013 1 1 -6 -25 762 116 -19 394.1379
## 6 2013 1 1 -4 12 719 150 16 287.6000
transmute(flights,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
## # A tibble: 336,776 x 3
## gain hours gain_per_hour
## <dbl> <dbl> <dbl>
## 1 9 3.7833333 2.378855
## 2 16 3.7833333 4.229075
## 3 31 2.6666667 11.625000
## 4 -17 3.0500000 -5.573770
## 5 -19 1.9333333 -9.827586
## 6 16 2.5000000 6.400000
## 7 24 2.6333333 9.113924
## 8 -11 0.8833333 -12.452830
## 9 -5 2.3333333 -2.142857
## 10 10 2.3000000 4.347826
## # ... with 336,766 more rows
transmute(flights,
dep_time,
hour = dep_time %/% 100,
minute = dep_time %% 100
)
## # A tibble: 336,776 x 3
## dep_time hour minute
## <int> <dbl> <dbl>
## 1 517 5 17
## 2 533 5 33
## 3 542 5 42
## 4 544 5 44
## 5 554 5 54
## 6 554 5 54
## 7 555 5 55
## 8 557 5 57
## 9 557 5 57
## 10 558 5 58
## # ... with 336,766 more rows
### finding the mean departure delay of a flight
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 x 1
## delay
## <dbl>
## 1 12.63907
### finding the delay by year. month and day
### not much useful until we pair it with Group by ###
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4
## # Groups: year, month [?]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.548926
## 2 2013 1 2 13.858824
## 3 2013 1 3 10.987832
## 4 2013 1 4 8.951595
## 5 2013 1 5 5.732218
## 6 2013 1 6 7.148014
## 7 2013 1 7 5.417204
## 8 2013 1 8 2.553073
## 9 2013 1 9 2.276477
## 10 2013 1 10 2.844995
## # ... with 355 more rows
### By destination
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
head(delay)
## # A tibble: 6 x 4
## dest count dist delay
## <chr> <int> <dbl> <dbl>
## 1 ABQ 254 1826.0000 4.381890
## 2 ACK 265 199.0000 4.852273
## 3 ALB 439 143.0000 14.397129
## 4 ANC 8 3370.0000 -2.500000
## 5 ATL 17215 757.1082 11.300113
## 6 AUS 2439 1514.2530 6.019909
delay <- filter(delay, count > 20, dest != "HNL")
head(delay)
## # A tibble: 6 x 4
## dest count dist delay
## <chr> <int> <dbl> <dbl>
## 1 ABQ 254 1826.0000 4.381890
## 2 ACK 265 199.0000 4.852273
## 3 ALB 439 143.0000 14.397129
## 4 ATL 17215 757.1082 11.300113
## 5 AUS 2439 1514.2530 6.019909
## 6 AVL 275 583.5818 8.003831