# Data manipulation with tidyverse -- by Reza Baneshi
# Split-apply-combine example ----
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.1
## Warning: package 'tidyr' was built under R version 4.1.1
## Warning: package 'readr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Manual split-apply-combine on the built-in iris data:
# split by species, take column means, then stack the results.
data(iris)

# Split: one numeric-only data frame per species (Species column dropped).
iris.set <- subset(iris, Species == "setosa", select = -5)
iris.versi <- subset(iris, Species == "versicolor", select = -5)
iris.virg <- subset(iris, Species == "virginica", select = -5)

# Apply: mean of every measurement within each species.
mean.set <- colMeans(iris.set)
mean.versi <- colMeans(iris.versi)
mean.virg <- colMeans(iris.virg)

# Combine: one row per species, labelled by the species name.
mean.iris <- rbind(
  setosa = mean.set,
  versicolor = mean.versi,
  virginica = mean.virg
)
mean.iris
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.006 3.428 1.462 0.246
## versicolor 5.936 2.770 4.260 1.326
## virginica 6.588 2.974 5.552 2.026
# Select a subset of observations and variables ----
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.1.1
# Load the flights table shipped with nycflights13 into the workspace.
data(flights)

# Create new variables and attach them to the data.
# `flights` already contains `hour` and `minute` columns, so the values
# derived from `dep_time` get their own names (`dep_hour`, `dep_minute`).
# Combining with data.frame(flights, newvar) would silently rename the
# clashing columns to `hour.1` / `minute.1` and turn the tibble into a plain
# data.frame; bind_cols() keeps the tibble and the names as written.
newvar <- transmute(
  flights,
  gain = arr_delay - dep_delay,   # arrival delay minus departure delay
  dep_hour = dep_time %/% 100,    # integer quotient of dep_time by 100
  dep_minute = dep_time %% 100    # remainder of dep_time by 100
)
flights <- bind_cols(flights, newvar)

# Select rows: flights on 1 January, and flights in November or December.
a <- filter(flights, month == 1, day == 1)
b <- filter(flights, month %in% c(11, 12))

# Select columns from the Nov/Dec subset: `c` keeps only the date parts,
# `d` keeps everything except the date parts.
# NOTE(review): the object name `c` shadows base::c() as a variable; the
# c(...) call below still resolves to the function. Name kept so that any
# later code referring to `c` continues to work.
c <- select(b, year, month, day)
d <- select(b, -c(year, month, day))

# Sort the rows of `c` by year, then month, then day.
c <- arrange(c, year, month, day)

# Mean departure delay per month, ignoring missing delays.
# NOTE(review): `byyear` is grouped by month, not year; name kept for
# compatibility with any later use.
byyear <- group_by(flights, month)
summarise(byyear, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 12 x 2
## month delay
## <int> <dbl>
## 1 1 10.0
## 2 2 10.8
## 3 3 13.2
## 4 4 13.9
## 5 5 13.0
## 6 6 20.8
## 7 7 21.7
## 8 8 12.6
## 9 9 6.72
## 10 10 6.24
## 11 11 5.44
## 12 12 16.6
# Select a subset of observations and variables (part 2) ----
# Select rows: characters with light skin and brown eyes.
a <- starwars %>%
  filter(skin_color == "light", eye_color == "brown")

# Select columns: keep only the three colour variables.
b <- starwars %>%
  select(hair_color, skin_color, eye_color)
# Combining multiple operations with the pipe ----
# Group the flights by destination airport.
by_dest <- flights %>% group_by(dest)

# Per destination: number of flights, mean distance and mean arrival delay
# (missing values ignored). Keep destinations with more than 20 flights and
# drop "HNL".
delay <- by_dest %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")

# Mean delay against mean distance; point size shows the number of flights,
# with a smoothed trend line drawn without its confidence band.
ggplot(delay, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 0.33) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'