dplyr

dplyr is a commonly used package for manipulating data frames.

library('dplyr')

filter() keeps only the rows of the data set that satisfy a condition:

# Keep only the cars whose mpg is above 30 and show the first four of them
head(filter(mtcars, mpg > 30), n = 4)

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 32.4   4 78.7  66 4.08 2.200 19.47  1  1    4    1
## 2 30.4   4 75.7  52 4.93 1.615 18.52  1  1    4    2
## 3 33.9   4 71.1  65 4.22 1.835 19.90  1  1    4    1
## 4 30.4   4 95.1 113 3.77 1.513 16.90  1  1    5    2

select() returns the data frame with only the specified columns:

# Keep just the mpg, cyl and wt columns and show the first four rows
head(select(mtcars, mpg, cyl, wt), n = 4)

##                 mpg cyl    wt
## Mazda RX4      21.0   6 2.620
## Mazda RX4 Wag  21.0   6 2.875
## Datsun 710     22.8   4 2.320
## Hornet 4 Drive 21.4   6 3.215

arrange() sorts the rows by the given column(s), in ascending order by default:

# Order the cars by wt (smallest first) and show the first four rows
head(arrange(mtcars, wt), n = 4)

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 30.4   4 95.1 113 3.77 1.513 16.90  1  1    5    2
## 2 30.4   4 75.7  52 4.93 1.615 18.52  1  1    4    2
## 3 33.9   4 71.1  65 4.22 1.835 19.90  1  1    4    1
## 4 27.3   4 79.0  66 4.08 1.935 18.90  1  1    4    1

mutate() creates new columns from existing ones:

# Append a double_wt column holding twice the wt value; show the first four rows
head(mutate(mtcars, double_wt = 2 * wt), n = 4)

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb double_wt
## 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4      5.24
## 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4      5.75
## 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1      4.64
## 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1      6.43

summarise() collapses rows into summary statistics: one row per group, or a single row when the data is not grouped (as below). It is often used with the pipe operator %>%.

# Collapse the whole (ungrouped) table into a single row of summary statistics
mtcars %>%
  summarise(
    # mean of the mpg column
    mean_mpg = mean(mpg),
    # n() takes no arguments: it simply counts the rows
    num_cars = n(),
    # standard deviation of the wt column
    sd_wt = sd(wt)
  )

##   mean_mpg num_cars     sd_wt
## 1 20.09062       32 0.9784574

We can use the pipe operator %>% (read it as ‘and then’) to chain dplyr functions:

# Build a three-column table: car name, doubled wt and kilometres per gallon
new_table <- mtcars %>%
  mutate(
    # twice the wt column
    double_wt = wt * 2,
    # mpg converted from miles to kilometres (1 mile is roughly 1.61 km)
    kms_per_gallon = mpg * 1.61,
    # the car names live in the row names, so copy them into a real column
    name = row.names(mtcars)
  ) %>%
  select(name, double_wt, kms_per_gallon)
head(new_table, n = 4)

##             name double_wt kms_per_gallon
## 1      Mazda RX4      5.24         33.810
## 2  Mazda RX4 Wag      5.75         33.810
## 3     Datsun 710      4.64         36.708
## 4 Hornet 4 Drive      6.43         34.454

group_by can be used to group rows into different categories. This is especially powerful if we summarise these groups. Summary statistics often used are mean(), sd() and n() (count).

# Average mpg per cylinder count: group first, then summarise each group.
mtcars %>%
  group_by(cyl) %>% # Put cars with the same cyl value into the same group
  summarise(avg_mpg = mean(mpg)) # One output row per group: that group's mean mpg

## # A tibble: 3 x 2
##     cyl avg_mpg
##   <dbl>   <dbl>
## 1     4    26.7
## 2     6    19.7
## 3     8    15.1

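To create a frequency table with set intervals (values outside the breaks, here every mpg below 20, are counted under NA, and intervals containing no cars, such as [31,32), do not appear in the result):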

# Frequency table of mpg over hand-picked intervals.
# right = FALSE makes every interval closed on the left and open on the right,
# e.g. [20,30). FALSE is spelled out because F is an ordinary variable that can
# be reassigned. Values of mpg that fall outside the breaks become NA and are
# counted in their own <NA> row.
mtcars %>%
  mutate(
    ints = cut(mpg, breaks = c(20, 30, 31, 32, 33, 50), right = FALSE)
  ) %>%
  group_by(ints) %>%
  summarise(n = n())

## Warning: Factor `ints` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## # A tibble: 5 x 2
##   ints        n
##   <fct>   <int>
## 1 [20,30)    10
## 2 [30,31)     2
## 3 [32,33)     1
## 4 [33,50)     1
## 5 <NA>       18

dplyr

Jake

15/05/2021