# Data manipulation with tidyverse -- by Reza Baneshi

# Split-apply-combine example

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.1
## Warning: package 'tidyr' was built under R version 4.1.1
## Warning: package 'readr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
data(iris)

# Split: one data frame per species, keeping only the four measurement
# columns (Species, the 5th column, is dropped).
iris.set <- subset(iris, Species == "setosa", select = -Species)
iris.versi <- subset(iris, Species == "versicolor", select = -Species)
iris.virg <- subset(iris, Species == "virginica", select = -Species)

# Apply: column means within each species.
mean.set <- colMeans(iris.set)
mean.versi <- colMeans(iris.versi)
mean.virg <- colMeans(iris.virg)

# Combine: stack the three mean vectors into a species-by-measurement matrix;
# the argument names become the row names.
mean.iris <- rbind(
  setosa = mean.set,
  versicolor = mean.versi,
  virginica = mean.virg
)
mean.iris
##            Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa            5.006       3.428        1.462       0.246
## versicolor        5.936       2.770        4.260       1.326
## virginica         6.588       2.974        5.552       2.026

# Select a subset of observations and variables

library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.1.1
data(flights)
# Create new variables and attach them to the data.
# `flights` already has `hour` and `minute` columns, so reusing those names
# made data.frame() silently rename the new columns to `hour.1` / `minute.1`.
# The values derived here from `dep_time` get their own names instead, and
# bind_cols() keeps `flights` a tibble rather than a plain data.frame.
newvar <- transmute(
  flights,
  gain = arr_delay - dep_delay,   # arrival delay minus departure delay
  dep_hour = dep_time %/% 100,    # integer part of dep_time / 100
  dep_minute = dep_time %% 100    # remainder of dep_time / 100
)
flights <- bind_cols(flights, newvar)

# Row selection: rows with month 1 and day 1
a <- flights %>% filter(month == 1, day == 1)
# Row selection: rows whose month is 11 or 12
b <- flights %>% filter(month %in% c(11, 12))
# Column selection: everything in `b` except the three date columns
d <- b %>% select(-year, -month, -day)
# Column selection: only the three date columns of `b`, then sorted by them
c <- b %>%
  select(year, month, day) %>%
  arrange(year, month, day)

# Mean departure delay per group, ignoring missing delays.
# Note: the grouped table is called `byyear`, but it is grouped by `month`.
byyear <- flights %>% group_by(month)
byyear %>% summarise(delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 12 x 2
##    month delay
##    <int> <dbl>
##  1     1 10.0 
##  2     2 10.8 
##  3     3 13.2 
##  4     4 13.9 
##  5     5 13.0 
##  6     6 20.8 
##  7     7 21.7 
##  8     8 12.6 
##  9     9  6.72
## 10    10  6.24
## 11    11  5.44
## 12    12 16.6

# Select a subset of observations and variables (2)

# Row selection: rows with light skin colour and brown eye colour
a <- filter(starwars, skin_color == "light", eye_color == "brown")
# Column selection: the three colour columns only
b <- select(starwars, hair_color, skin_color, eye_color)

# Combining multiple operations with the pipe

# Group by destination.
by_dest <- flights %>% group_by(dest)

# Per destination: number of rows, mean distance and mean arrival delay
# (missing values ignored); then keep destinations with more than 20 rows
# and drop "HNL".
delay <- by_dest %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")

# Mean delay against mean distance, point size scaled by row count,
# with a smoother drawn without its confidence band.
ggplot(delay, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 0.33) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'