This tutorial is part of the dplyr training series; the link to the accompanying YouTube video is given below.
dplyr is a great tool to use in R.
The commands may look long and overwhelming to someone not using dplyr but that is not the case.
Once you learn the basics, it is very intuitive to use, just like forming sentences once you have learnt the basic words of a language.
This tutorial is for beginners as well as experienced R users who want to learn the various dplyr commands.
We will be covering all practical aspects of the dplyr::summarise command in this tutorial. It is part of a series of tutorials on all practical aspects of dplyr. All the YouTube videos are available in a single playlist:
https://www.youtube.com/playlist?list=PLkHcMTpvAaXVJzyRSytUn3nSK92TJphxR
Here is the link to this video: https://youtu.be/AID9TSzcMF8
We will be using the diamonds dataset, which ships with ggplot2, in this tutorial.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep the diamonds data under a short name and print it for a first look.
d <- diamonds
d

# mutate() keeps every row: the overall mean price is repeated on each one.
d1 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::mutate(mean_price = mean(price))
d1
# summarise() without grouping collapses the data to a single row
# holding the overall mean price.
d2 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::summarise(mean_price = mean(price))
d2
# Grouping by cut first makes summarise() return one mean price per cut.
d3 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(mean_price = mean(price))
d3
# The summarised table can be piped straight into ggplot():
# one bar per cut, with the bar height taken from mean_price as-is.
d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(mean_price = mean(price)) %>%
  ggplot(aes(x = cut, y = mean_price)) +
  geom_bar(stat = "identity")
# Several summaries can be computed in one summarise() call:
# here the mean price and the number of rows per cut.
# relocate() then moves the count so it sits right after the cut column.
d4 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(
    mean_price = mean(price),
    n = n()
  ) %>%
  dplyr::relocate(n, .after = cut)
d4
# Mean price, row count and standard deviation of price per cut,
# with the count moved next to the cut column.
d5 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(
    mean_price = mean(price),
    n = n(),
    sd = sd(price)
  ) %>%
  dplyr::relocate(n, .after = cut)
d5
# A fuller set of price statistics per cut: extremes, mean, median and
# the three quartiles (the 0.5 quantile is the median again).
d6 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(
    min_price = min(price),
    max_price = max(price),
    mean_price = mean(price),
    median = median(price),
    q1 = quantile(price, 0.25),
    q2_aka_median = quantile(price, 0.5),
    q3 = quantile(price, 0.75)
  )
d6
# d6 <- d %>%
# dplyr::select(cut,price)%>%
# dplyr::group_by(cut)%>%
# dplyr::summarise(min_price = min(price))%>%
# dplyr::summarise(max_price = max(price))
# d6
# You would get an error here: the first summarise() keeps only cut and min_price, so the second summarise() cannot find price.
#Error: Problem with `summarise()` column `max_price`.
#i `max_price = max(price)`.
#x object 'price' not found
# Six summary statistics of price per cut.
# fivenum() returns exactly five values, in this order:
#   [1] minimum, [2] lower hinge, [3] median, [4] upper hinge, [5] maximum.
# It does not return the mean and has no sixth element (indexing [6] gives
# NA), so the mean is computed separately with mean().
# The hinges are used for q1 and q3; they are close to, but not always
# identical to, the quartiles returned by quantile().
d7 <- d %>%
  dplyr::select(cut, price) %>%
  dplyr::group_by(cut) %>%
  dplyr::summarise(
    min = fivenum(price)[1],
    q1 = fivenum(price)[2],
    median = fivenum(price)[3],
    mean = mean(price),
    q3 = fivenum(price)[4],
    max = fivenum(price)[5]
  )
d7
#summary(d$price)
#summary() gives the following 6 summary statistics:
# [Minimum] [1st Quartile] [Median] [Mean] [3rd Quartile] [Maximum]
#fivenum(d$price)
#Tukey's five number summary
# [Minimum] [Lower hinge] [Median] [Upper hinge] [Maximum]
#Lower Hinge = median of the values to the left of Actual Median
#Upper hinge = median of the values to the right of Actual Median
#fivenum(d$price)
# rowwise() makes each row its own group, so summarise() returns one value
# per row: the mean of that row's x, y and z. Only the new column is kept.
d8 <- d %>%
  dplyr::rowwise() %>%
  dplyr::summarise(mean = mean(c(x, y, z)))
d8
# The same row-wise mean of x, y and z, but with mutate() the original
# columns are kept and the mean is added as a new column.
d8 <- d %>%
  dplyr::rowwise() %>%
  dplyr::mutate(mean = mean(c(x, y, z)))
d8
# Small example data set: three marker readings per visit.
# Patients P1, P2 and P3 appear twice, P4 once.
dtry <- data.frame(
  PatientID = c("P1", "P2", "P3", "P4", "P1", "P2", "P3"),
  marker1 = c(20, 21, 23, 23, 36, 24, 34),
  marker2 = c(18, 24, 22, 24, 33, 34, 22),
  marker3 = c(26, 23, 23, 24, 31, 40, 18)
)
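# This command will not work: mean() averages only its first argument, so marker2 and marker3 are taken as its trim and na.rm arguments instead of being averaged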
#d9 <- dtry%>%
# dplyr::group_by(PatientID)%>%
# dplyr::mutate(mean = mean(marker1, marker2, marker3))
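# This command will not work either: mean() averages only its first argument, so marker2 and marker3 are taken as its trim and na.rm arguments instead of being averaged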
#d9 <- dtry%>%
# dplyr::group_by(PatientID)%>%
# dplyr::summarise(mean = mean(marker1, marker2, marker3))
# Combine the three markers into one vector with c() and work row by row:
# every visit gets the mean of its own three marker values as a new column.
d9 <- dtry %>%
  dplyr::group_by(PatientID) %>%
  dplyr::rowwise() %>%
  dplyr::mutate(mean = mean(c(marker1, marker2, marker3)))
# Row-wise summary: one mean of the three markers per visit.
# PatientID is kept because the data was grouped by it before rowwise().
d9 <- dtry %>%
  dplyr::group_by(PatientID) %>%
  dplyr::rowwise() %>%
  dplyr::summarise(mean = mean(c(marker1, marker2, marker3)))
## `summarise()` has grouped output by 'PatientID'. You can override using the
## `.groups` argument.
# Column-wise summary: average the per-visit means within each patient,
# giving one row per PatientID.
d9 <- d9 %>%
  dplyr::group_by(PatientID) %>%
  dplyr::summarise(mean = mean(mean))