1 Goal

The goal of this tutorial is to use the group_by function to create datasets grouped by a defined variable.

2 Group By Function

# First we load the library dplyr
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# In this tutorial we are going to work with the iris plant dataset
data("iris")
str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# First we have to define which column we are going to use to group
# In this case we use the species of the plant
my_group <- group_by(iris, Species)

# Now with summarize we define which function we use to group the values
# Check the aggregate tutorial to see different functions that can be used to group data
summarize_each(my_group, funs(mean))

## # A tibble: 3 × 5
##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
##       <fctr>        <dbl>       <dbl>        <dbl>       <dbl>
## 1     setosa        5.006       3.428        1.462       0.246
## 2 versicolor        5.936       2.770        4.260       1.326
## 3  virginica        6.588       2.974        5.552       2.026

3 Group by dates

3.1 Data preparation

# In this part we are going to use a dates table
# Check "Read date and time variables" tutorial for further explanation on date variables
Time_table <- read.csv("timetable.csv", stringsAsFactors = FALSE)
Time_table <- mutate(Time_table, DateTime = paste(Date, Time))
Time_table$DateTime <- strptime(Time_table$DateTime, "%d/%m/%y %H:%M:%S")
Time_table$DateTime <- as.POSIXct(Time_table$DateTime)

# Now we have a table with a proper date-time variable
head(Time_table)

##       Date     Time            DateTime
## 1 16/12/06 18:00:00 2006-12-16 18:00:00
## 2 16/12/06 19:00:00 2006-12-16 19:00:00
## 3 16/12/06 20:00:00 2006-12-16 20:00:00
## 4 16/12/06 21:00:00 2006-12-16 21:00:00
## 5 16/12/06 22:00:00 2006-12-16 22:00:00
## 6 16/12/06 23:00:00 2006-12-16 23:00:00

3.2 Group by Month

# We are going to use the lubridate library
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

# First we filter the year 2008
Time_table08 <- Time_table[which(year(Time_table$DateTime) == 2008),]

# Now we add a column with the month
# Check the "Read date and time variables" and the "mutate" tutorials
Time_table08 <- mutate(Time_table08, Month = factor(format(DateTime, "%m")))
str(Time_table08)

## 'data.frame':    8783 obs. of  4 variables:
##  $ Date    : chr  "01/01/08" "01/01/08" "01/01/08" "01/01/08" ...
##  $ Time    : chr  "00:00:00" "01:00:00" "02:00:00" "03:00:00" ...
##  $ DateTime: POSIXct, format: "2008-01-01 00:00:00" "2008-01-01 01:00:00" ...
##  $ Month   : Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...

# We add a numerical value to aggregate
Time_table08$Value <- 2

# Now we group by the month column
my_group <- group_by(Time_table08, Month)
summarize(my_group, Value = sum(Value))

## # A tibble: 12 × 2
##     Month Value
##    <fctr> <dbl>
## 1      01  1488
## 2      02  1392
## 3      03  1486
## 4      04  1440
## 5      05  1488
## 6      06  1440
## 7      07  1488
## 8      08  1488
## 9      09  1440
## 10     10  1488
## 11     11  1440
## 12     12  1488

# We can also summarize using the average
summarize(my_group, Value = mean(Value))

## # A tibble: 12 × 2
##     Month Value
##    <fctr> <dbl>
## 1      01     2
## 2      02     2
## 3      03     2
## 4      04     2
## 5      05     2
## 6      06     2
## 7      07     2
## 8      08     2
## 9      09     2
## 10     10     2
## 11     11     2
## 12     12     2

3.3 Group by week day

# Now we add a column with the day of the week
# Remember that 1 is sunday, not monday
# Check the "Read date and time variables" and the "mutate" tutorials
Time_table08 <- mutate(Time_table08, week_day = wday(DateTime))
str(Time_table08)

## 'data.frame':    8783 obs. of  6 variables:
##  $ Date    : chr  "01/01/08" "01/01/08" "01/01/08" "01/01/08" ...
##  $ Time    : chr  "00:00:00" "01:00:00" "02:00:00" "03:00:00" ...
##  $ DateTime: POSIXct, format: "2008-01-01 00:00:00" "2008-01-01 01:00:00" ...
##  $ Month   : Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Value   : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ week_day: num  3 3 3 3 3 3 3 3 3 3 ...

# Now we group by the weekday column
my_group <- group_by(Time_table08, week_day)
summarize(my_group, Value = sum(Value))

## # A tibble: 7 × 2
##   week_day Value
##      <dbl> <dbl>
## 1        1  2494
## 2        2  2496
## 3        3  2544
## 4        4  2544
## 5        5  2496
## 6        6  2496
## 7        7  2496

# We can also summarize using the average
summarize(my_group, Value = mean(Value))

## # A tibble: 7 × 2
##   week_day Value
##      <dbl> <dbl>
## 1        1     2
## 2        2     2
## 3        3     2
## 4        4     2
## 5        5     2
## 6        6     2
## 7        7     2

3.4 Group by month day

# Now we add a column with the day of the month
# Remember that 1 is sunday, not monday
# Check the "Read date and time variables" and the "mutate" tutorials
Time_table08 <- mutate(Time_table08, day = day(DateTime))
str(Time_table08)

## 'data.frame':    8783 obs. of  7 variables:
##  $ Date    : chr  "01/01/08" "01/01/08" "01/01/08" "01/01/08" ...
##  $ Time    : chr  "00:00:00" "01:00:00" "02:00:00" "03:00:00" ...
##  $ DateTime: POSIXct, format: "2008-01-01 00:00:00" "2008-01-01 01:00:00" ...
##  $ Month   : Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Value   : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ week_day: num  3 3 3 3 3 3 3 3 3 3 ...
##  $ day     : int  1 1 1 1 1 1 1 1 1 1 ...

# Now we group by the day of the month column
my_group <- group_by(Time_table08, day)

# We see that 2008 was a leap year so day 30 and 31 are different
tail(summarize(my_group, Value = sum(Value)))

## # A tibble: 6 × 2
##     day Value
##   <int> <dbl>
## 1    26   576
## 2    27   576
## 3    28   576
## 4    29   576
## 5    30   526
## 6    31   336

# We can also summarize using the average
summarize(my_group, Value = mean(Value))

## # A tibble: 31 × 2
##      day Value
##    <int> <dbl>
## 1      1     2
## 2      2     2
## 3      3     2
## 4      4     2
## 5      5     2
## 6      6     2
## 7      7     2
## 8      8     2
## 9      9     2
## 10    10     2
## # ... with 21 more rows

4 Conclusion

In this tutorial we have learnt how to use the group_by function both on categorical data and different date configurations. This process can be useful when we want to change the granularity of a dataset.

Group by function

Luis Serra @ Ubiqum Code Academy

1 Goal

2 Group By Function

3 Group by dates

3.1 Data preparation

3.2 Group by Month

3.3 Group by week day

3.4 Group by month day

4 Conclusion