Data manipulation in Dplyr (Part1)

R Markdown

This tutorial introduces the dplyr package for general purpose data manipulationThis tutorial will cover the following functions in the dplyr package:

select()

arrange()

filter()

mutate()

transmute()

summarise()

1. select

Select columns with select()

We will select a few columns from mtcars datasets to perform select operation on the data frame.

library(dplyr,quietly = TRUE)

## Warning: package 'dplyr' was built under R version 3.4.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(nycflights13,quietly = TRUE)

## Warning: package 'nycflights13' was built under R version 3.4.3

head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

#### Selecting only the desired columns from a dataframe###
d <- select(mtcars,mpg,disp,drat,am)
head(d)

##                    mpg disp drat am
## Mazda RX4         21.0  160 3.90  1
## Mazda RX4 Wag     21.0  160 3.90  1
## Datsun 710        22.8  108 3.85  1
## Hornet 4 Drive    21.4  258 3.08  0
## Hornet Sportabout 18.7  360 3.15  0
## Valiant           18.1  225 2.76  0

##select a list of columns from wt to carb in my dataframe 
v <- select(mtcars, cyl, wt:carb)
head(v)

##                   cyl    wt  qsec vs am gear carb
## Mazda RX4           6 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       6 2.875 17.02  0  1    4    4
## Datsun 710          4 2.320 18.61  1  1    4    1
## Hornet 4 Drive      6 3.215 19.44  1  0    3    1
## Hornet Sportabout   8 3.440 17.02  0  0    3    2
## Valiant             6 3.460 20.22  1  0    3    1

Select with helpers

## Using Helpers with select helps for some adhoc match requirements
## starts_with
new <- select(iris,contains("Sepal"))
head(new)

##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2
## 4          4.6         3.1
## 5          5.0         3.6
## 6          5.4         3.9

## ends_with 
new1 <- select(iris,ends_with("Width"))
head(new1)

##   Sepal.Width Petal.Width
## 1         3.5         0.2
## 2         3.0         0.2
## 3         3.2         0.2
## 4         3.1         0.2
## 5         3.6         0.2
## 6         3.9         0.4

## drop variables with -
new2 <- select(iris, -starts_with("Petal"))
head(new2)

##   Sepal.Length Sepal.Width Species
## 1          5.1         3.5  setosa
## 2          4.9         3.0  setosa
## 3          4.7         3.2  setosa
## 4          4.6         3.1  setosa
## 5          5.0         3.6  setosa
## 6          5.4         3.9  setosa

filter rows with filter()

To filter a data frame we need to use a logical vector. For example, if we wanted to filter mtcars by cyl and gear

### To filter on 1 column only
d <- filter(mtcars, cyl == 8)

## Warning: package 'bindrcpp' was built under R version 3.4.2

head(d)

##    mpg cyl  disp  hp drat   wt  qsec vs am gear carb
## 1 18.7   8 360.0 175 3.15 3.44 17.02  0  0    3    2
## 2 14.3   8 360.0 245 3.21 3.57 15.84  0  0    3    4
## 3 16.4   8 275.8 180 3.07 4.07 17.40  0  0    3    3
## 4 17.3   8 275.8 180 3.07 3.73 17.60  0  0    3    3
## 5 15.2   8 275.8 180 3.07 3.78 18.00  0  0    3    3
## 6 10.4   8 472.0 205 2.93 5.25 17.98  0  0    3    4

### To filter on more than one column with a condition
d <- filter(mtcars, hp > 3, gear == 3 | gear == 5)
head(d)

##    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 2 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 3 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 4 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## 5 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 6 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3

### using in parameter for the same functionality
e <- filter(mtcars, disp %in% c(258, 318))
head(e)

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 2 15.5   8  318 150 2.76 3.520 16.87  0  0    3    2

Arrange rows with arrange()

In dplyr ordering a data frame is very easy. We use the arrange() function.

head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

### Arranging first by cyl and then by disp, for mtcars dataset
new4 <- arrange(mtcars, cyl, disp)
head(new4)

##    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 2 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 3 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 4 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 5 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 6 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1

### arranging in descending order
new5 <- arrange(mtcars, desc(disp))
head(new5)

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 10.4   8  472 205 2.93 5.250 17.98  0  0    3    4
## 2 10.4   8  460 215 3.00 5.424 17.82  0  0    3    4
## 3 14.7   8  440 230 3.23 5.345 17.42  0  0    3    4
## 4 19.2   8  400 175 3.08 3.845 17.05  0  0    3    2
## 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 6 14.3   8  360 245 3.21 3.570 15.84  0  0    3    4

Make new columns with mutate()

using tidyverse package with nycflights3 to use mutate()

library(nycflights13)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.4.3

## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --

## v ggplot2 2.2.1     v readr   1.1.1
## v tibble  1.3.4     v purrr   0.2.4
## v tidyr   0.7.2     v stringr 1.2.0
## v ggplot2 2.2.1     v forcats 0.2.0

## Warning: package 'ggplot2' was built under R version 3.4.3

## Warning: package 'tibble' was built under R version 3.4.2

## Warning: package 'tidyr' was built under R version 3.4.3

## Warning: package 'readr' was built under R version 3.4.3

## Warning: package 'purrr' was built under R version 3.4.3

## Warning: package 'stringr' was built under R version 3.4.3

## Warning: package 'forcats' was built under R version 3.4.3

## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

## using select to pull desired data
flights_sml <- select(flights, 
                      year:day, 
                      ends_with("delay"), 
                      distance, 
                      air_time
)
## make two new columns named as gain and speed in the same data set
flights_sml

## # A tibble: 336,776 x 7
##     year month   day dep_delay arr_delay distance air_time
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
##  1  2013     1     1         2        11     1400      227
##  2  2013     1     1         4        20     1416      227
##  3  2013     1     1         2        33     1089      160
##  4  2013     1     1        -1       -18     1576      183
##  5  2013     1     1        -6       -25      762      116
##  6  2013     1     1        -4        12      719      150
##  7  2013     1     1        -5        19     1065      158
##  8  2013     1     1        -3       -14      229       53
##  9  2013     1     1        -3        -8      944      140
## 10  2013     1     1        -2         8      733      138
## # ... with 336,766 more rows

new6 <- mutate(flights_sml,
       gain = arr_delay - dep_delay,
       speed = distance / air_time * 60
       
)
head(new6)

## # A tibble: 6 x 9
##    year month   day dep_delay arr_delay distance air_time  gain    speed
##   <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl>    <dbl>
## 1  2013     1     1         2        11     1400      227     9 370.0441
## 2  2013     1     1         4        20     1416      227    16 374.2731
## 3  2013     1     1         2        33     1089      160    31 408.3750
## 4  2013     1     1        -1       -18     1576      183   -17 516.7213
## 5  2013     1     1        -6       -25      762      116   -19 394.1379
## 6  2013     1     1        -4        12      719      150    16 287.6000

Transmute if you only want to keep new variables

transmute(flights,
          gain = arr_delay - dep_delay,
          hours = air_time / 60,
          gain_per_hour = gain / hours
)

## # A tibble: 336,776 x 3
##     gain     hours gain_per_hour
##    <dbl>     <dbl>         <dbl>
##  1     9 3.7833333      2.378855
##  2    16 3.7833333      4.229075
##  3    31 2.6666667     11.625000
##  4   -17 3.0500000     -5.573770
##  5   -19 1.9333333     -9.827586
##  6    16 2.5000000      6.400000
##  7    24 2.6333333      9.113924
##  8   -11 0.8833333    -12.452830
##  9    -5 2.3333333     -2.142857
## 10    10 2.3000000      4.347826
## # ... with 336,766 more rows

transmute(flights,
          dep_time,
          hour = dep_time %/% 100,
          minute = dep_time %% 100
)

## # A tibble: 336,776 x 3
##    dep_time  hour minute
##       <int> <dbl>  <dbl>
##  1      517     5     17
##  2      533     5     33
##  3      542     5     42
##  4      544     5     44
##  5      554     5     54
##  6      554     5     54
##  7      555     5     55
##  8      557     5     57
##  9      557     5     57
## 10      558     5     58
## # ... with 336,766 more rows

Let’s Summarise() the data now

### finding the mean departure delay of a flight
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))

## # A tibble: 1 x 1
##      delay
##      <dbl>
## 1 12.63907

### finding the delay by year. month and day
### not much useful until we pair it with Group by ###
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))

## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day     delay
##    <int> <int> <int>     <dbl>
##  1  2013     1     1 11.548926
##  2  2013     1     2 13.858824
##  3  2013     1     3 10.987832
##  4  2013     1     4  8.951595
##  5  2013     1     5  5.732218
##  6  2013     1     6  7.148014
##  7  2013     1     7  5.417204
##  8  2013     1     8  2.553073
##  9  2013     1     9  2.276477
## 10  2013     1    10  2.844995
## # ... with 355 more rows

### By destination
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
                   count = n(),
                   dist = mean(distance, na.rm = TRUE),
                   delay = mean(arr_delay, na.rm = TRUE)
)
head(delay)

## # A tibble: 6 x 4
##    dest count      dist     delay
##   <chr> <int>     <dbl>     <dbl>
## 1   ABQ   254 1826.0000  4.381890
## 2   ACK   265  199.0000  4.852273
## 3   ALB   439  143.0000 14.397129
## 4   ANC     8 3370.0000 -2.500000
## 5   ATL 17215  757.1082 11.300113
## 6   AUS  2439 1514.2530  6.019909

delay <- filter(delay, count > 20, dest != "HNL")
head(delay)

## # A tibble: 6 x 4
##    dest count      dist     delay
##   <chr> <int>     <dbl>     <dbl>
## 1   ABQ   254 1826.0000  4.381890
## 2   ACK   265  199.0000  4.852273
## 3   ALB   439  143.0000 14.397129
## 4   ATL 17215  757.1082 11.300113
## 5   AUS  2439 1514.2530  6.019909
## 6   AVL   275  583.5818  8.003831