This tutorial is part of the dplyr training series. Here is the YouTube video link

Why dplyr

dplyr is a great tool to use in R.

The commands may look long and overwhelming to someone not using dplyr but that is not the case.

Once you learn the basics of it then it is very intuitive to use. Just like making a sentence once you have learnt the basic words of a language.

Audience

For beginners or experienced R users wanting to learn various commands of dplyr.

DPLYR : summarise or summarize

We will be covering all practical aspects of the dplyr::summarise command in this tutorial. It is part of a series of tutorials on all practical aspects of dplyr. All YouTube videos are available in a single playlist on YouTube.

https://www.youtube.com/playlist?list=PLkHcMTpvAaXVJzyRSytUn3nSK92TJphxR

Here is the link to this video: https://youtu.be/AID9TSzcMF8

Create sample dataset

We will be using the built-in dataset called diamonds in this tutorial.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Have a look at the sample dataset

# Take the diamonds dataset as the working data for every example below.
d <- diamonds
# Print it to inspect the columns used later (cut, price, x, y, z, ...).
d

Use the mutate command to calculate the mean

# mutate() keeps every row and appends the overall mean price as a new column,
# so the same mean value is repeated on each row.
d1 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::mutate(mean_price = mean(price))

d1

Now use the summarise command

# summarise() collapses the data to a single row holding the overall mean price.
d2 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::summarise(mean_price = mean(price))

d2

summarise with group by

# Mean price per cut: group by cut first, then summarise once per group.
d3 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(mean_price = mean(price))

d3

# The same grouped summary piped straight into a bar chart of mean price by
# cut. stat = "identity" plots the summarised values as the bar heights.
d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(mean_price = mean(price)) %>%
  ggplot(aes(x = cut, y = mean_price)) +
  geom_bar(stat = "identity")

Have more than 1 aggregate using summarise

# Two aggregates in one summarise() call: mean price and row count per cut.
d4 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(
    mean_price = mean(price),
    n = n()
  ) %>%
  # Move the count so it sits directly after the grouping column.
  dplyr::relocate(n, .after = cut)

d4
# Three aggregates per cut: mean price, row count and standard deviation.
d5 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(
    mean_price = mean(price),
    n = n(),
    sd = sd(price)
  ) %>%
  # Move the count so it sits directly after the grouping column.
  dplyr::relocate(n, .after = cut)

d5
# Range, centre and quartiles of price for each cut, all in one summarise().
# The 0.5 quantile is included next to median() to show they are the same value.
d6 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(
    min_price = min(price),
    max_price = max(price),
    mean_price = mean(price),
    median = median(price),
    q1 = quantile(price, 0.25),
    q2_aka_median = quantile(price, 0.5),
    q3 = quantile(price, 0.75)
  )
d6
# d6 <- d %>%
#       dplyr::select(cut,price)%>%
#       dplyr::group_by(cut)%>%
#       dplyr::summarise(min_price = min(price))%>%
#       dplyr::summarise(max_price = max(price))
# d6

# The commented-out code above fails: the first summarise() keeps only `cut` and
# `min_price`, so the second summarise() can no longer find the `price` column.
#Error: Problem with `summarise()` column `max_price`.
#i `max_price = max(price)`.
#x object 'price' not found
# Six-number summary of price for each cut, taken from base R's summary().
# summary() on a numeric vector returns six named statistics (Min., 1st Qu.,
# Median, Mean, 3rd Qu., Max.), so each column below holds the statistic its
# name promises. fivenum() is not suitable here: it returns only five values
# (minimum, lower hinge, median, upper hinge, maximum) and no mean, so
# indexing it up to [6] mislabels the columns and yields NA for the last one.
# Values are extracted by name rather than position to keep the mapping explicit.
d7 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(
    min = summary(price)[["Min."]],
    q1 = summary(price)[["1st Qu."]],
    median = summary(price)[["Median"]],
    mean = summary(price)[["Mean"]],
    q3 = summary(price)[["3rd Qu."]],
    max = summary(price)[["Max."]]
  )

d7
#summary(d$price)
#summary() gives the following 6  summary statistics: 
# [Minimum] [1st Quartile] [Median] [Mean] [3rd Quartile] [Maximum]

#fivenum(d$price)
#Tukey's five number summary 
# [Minimum] [Lower hinge] [Median] [Upper hinge] [Maximum]
#Lower Hinge = median of the values to the left of Actual Median
#Upper hinge = median of the values to the right of Actual Median

#fivenum(d$price)

Rowwise summary instead of column wise

# Row-wise mean of the x, y and z columns.
# With summarise() the result has one row per input row and only the new
# `mean` column.
d8 <- d %>%
  dplyr::rowwise() %>%
  dplyr::summarise(mean = mean(c(x, y, z)))
d8

# With mutate() the same row-wise mean is appended to the original columns.
d8 <- d %>%
  dplyr::rowwise() %>%
  dplyr::mutate(mean = mean(c(x, y, z)))
d8

rowwise summarising

# Toy data: one row per measurement with three marker columns.
# P1, P2 and P3 each appear twice; P4 appears once.
dtry <- data.frame(
  PatientID = c("P1", "P2", "P3", "P4", "P1", "P2", "P3"),
  marker1 = c(20, 21, 23, 23, 36, 24, 34),
  marker2 = c(18, 24, 22, 24, 33, 34, 22),
  marker3 = c(26, 23, 23, 24, 31, 40, 18)
)

# This command will not work
#d9 <- dtry%>%
#  dplyr::group_by(PatientID)%>%
#      dplyr::mutate(mean = mean(marker1, marker2, marker3))

# This command will not work
#d9 <- dtry%>%
#  dplyr::group_by(PatientID)%>%
#      dplyr::summarise(mean = mean(marker1, marker2, marker3))


# Row-wise mean of the three markers, appended to the original columns.
# c() combines the three values of a row into one vector so that mean()
# averages them.
d9 <- dtry %>%
  dplyr::group_by(PatientID) %>%
  dplyr::rowwise() %>%
  dplyr::mutate(mean = mean(c(marker1, marker2, marker3)))


# Row-wise summary: one mean per input row, with PatientID carried along.
d9 <- dtry %>%
  dplyr::group_by(PatientID) %>%
  dplyr::rowwise() %>%
  dplyr::summarise(mean = mean(c(marker1, marker2, marker3)))
## `summarise()` has grouped output by 'PatientID'. You can override using the
## `.groups` argument.

# Column-wise summary: average the row means within each patient.
d9 <- d9 %>%
  dplyr::group_by(PatientID) %>%
  dplyr::summarise(mean = mean(mean))