The goal of this tutorial is to use the group_by function to create datasets grouped by a defined variable.
# First we load the library dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# In this tutorial we are going to work with the iris plant dataset
data("iris")
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# First we have to define which column we are going to use to group
# In this case we use the species of the plant
my_group <- group_by(iris, Species)
# Now with summarize we define which function we use to group the values
# Check the aggregate tutorial to see different functions that can be used to group data
summarize_each(my_group, funs(mean))
## # A tibble: 3 × 5
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.006 3.428 1.462 0.246
## 2 versicolor 5.936 2.770 4.260 1.326
## 3 virginica 6.588 2.974 5.552 2.026
# In this part we are going to use a dates table
# Check "Read date and time variables" tutorial for further explanation on date variables
Time_table <- read.csv("timetable.csv", stringsAsFactors = FALSE)
Time_table <- mutate(Time_table, DateTime = paste(Date, Time))
Time_table$DateTime <- strptime(Time_table$DateTime, "%d/%m/%y %H:%M:%S")
Time_table$DateTime <- as.POSIXct(Time_table$DateTime)
# Now we have a table with a proper date-time variable
head(Time_table)
## Date Time DateTime
## 1 16/12/06 18:00:00 2006-12-16 18:00:00
## 2 16/12/06 19:00:00 2006-12-16 19:00:00
## 3 16/12/06 20:00:00 2006-12-16 20:00:00
## 4 16/12/06 21:00:00 2006-12-16 21:00:00
## 5 16/12/06 22:00:00 2006-12-16 22:00:00
## 6 16/12/06 23:00:00 2006-12-16 23:00:00
# We are going to use the lubridate library
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# First we filter the year 2008
Time_table08 <- Time_table[which(year(Time_table$DateTime) == 2008),]
# Now we add a column with the month
# Check the "Read date and time variables" and the "mutate" tutorials
Time_table08 <- mutate(Time_table08, Month = factor(format(DateTime, "%m")))
str(Time_table08)
## 'data.frame': 8783 obs. of 4 variables:
## $ Date : chr "01/01/08" "01/01/08" "01/01/08" "01/01/08" ...
## $ Time : chr "00:00:00" "01:00:00" "02:00:00" "03:00:00" ...
## $ DateTime: POSIXct, format: "2008-01-01 00:00:00" "2008-01-01 01:00:00" ...
## $ Month : Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
# We add a numerical value to aggregate
Time_table08$Value <- 2
# Now we group by the month column
my_group <- group_by(Time_table08, Month)
summarize(my_group, Value = sum(Value))
## # A tibble: 12 × 2
## Month Value
## <fctr> <dbl>
## 1 01 1488
## 2 02 1392
## 3 03 1486
## 4 04 1440
## 5 05 1488
## 6 06 1440
## 7 07 1488
## 8 08 1488
## 9 09 1440
## 10 10 1488
## 11 11 1440
## 12 12 1488
# We can also summarize using the average
summarize(my_group, Value = mean(Value))
## # A tibble: 12 × 2
## Month Value
## <fctr> <dbl>
## 1 01 2
## 2 02 2
## 3 03 2
## 4 04 2
## 5 05 2
## 6 06 2
## 7 07 2
## 8 08 2
## 9 09 2
## 10 10 2
## 11 11 2
## 12 12 2
# Now we add a column with the day of the week
# Remember that 1 is sunday, not monday
# Check the "Read date and time variables" and the "mutate" tutorials
Time_table08 <- mutate(Time_table08, week_day = wday(DateTime))
str(Time_table08)
## 'data.frame': 8783 obs. of 6 variables:
## $ Date : chr "01/01/08" "01/01/08" "01/01/08" "01/01/08" ...
## $ Time : chr "00:00:00" "01:00:00" "02:00:00" "03:00:00" ...
## $ DateTime: POSIXct, format: "2008-01-01 00:00:00" "2008-01-01 01:00:00" ...
## $ Month : Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Value : num 2 2 2 2 2 2 2 2 2 2 ...
## $ week_day: num 3 3 3 3 3 3 3 3 3 3 ...
# Now we group by the weekday column
my_group <- group_by(Time_table08, week_day)
summarize(my_group, Value = sum(Value))
## # A tibble: 7 × 2
## week_day Value
## <dbl> <dbl>
## 1 1 2494
## 2 2 2496
## 3 3 2544
## 4 4 2544
## 5 5 2496
## 6 6 2496
## 7 7 2496
# We can also summarize using the average
summarize(my_group, Value = mean(Value))
## # A tibble: 7 × 2
## week_day Value
## <dbl> <dbl>
## 1 1 2
## 2 2 2
## 3 3 2
## 4 4 2
## 5 5 2
## 6 6 2
## 7 7 2
# Now we add a column with the day of the month
# Remember that 1 is sunday, not monday
# Check the "Read date and time variables" and the "mutate" tutorials
Time_table08 <- mutate(Time_table08, day = day(DateTime))
str(Time_table08)
## 'data.frame': 8783 obs. of 7 variables:
## $ Date : chr "01/01/08" "01/01/08" "01/01/08" "01/01/08" ...
## $ Time : chr "00:00:00" "01:00:00" "02:00:00" "03:00:00" ...
## $ DateTime: POSIXct, format: "2008-01-01 00:00:00" "2008-01-01 01:00:00" ...
## $ Month : Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Value : num 2 2 2 2 2 2 2 2 2 2 ...
## $ week_day: num 3 3 3 3 3 3 3 3 3 3 ...
## $ day : int 1 1 1 1 1 1 1 1 1 1 ...
# Now we group by the day of the month column
my_group <- group_by(Time_table08, day)
# We see that 2008 was a leap year so day 30 and 31 are different
tail(summarize(my_group, Value = sum(Value)))
## # A tibble: 6 × 2
## day Value
## <int> <dbl>
## 1 26 576
## 2 27 576
## 3 28 576
## 4 29 576
## 5 30 526
## 6 31 336
# We can also summarize using the average
summarize(my_group, Value = mean(Value))
## # A tibble: 31 × 2
## day Value
## <int> <dbl>
## 1 1 2
## 2 2 2
## 3 3 2
## 4 4 2
## 5 5 2
## 6 6 2
## 7 7 2
## 8 8 2
## 9 9 2
## 10 10 2
## # ... with 21 more rows
In this tutorial we have learnt how to use the group_by function both on categorical data and different date configurations. This process can be useful when we want to change the granularity of a dataset.