Examples of dplyr package in R

Hsin-Yu Cheng
Update : July 03, 2015

Resources

  1. RStudio Cheatsheet
  2. Cran Introduction
  3. swirl package in R
install.packages("swirl")
library(swirl)
install_from_swirl("Getting and Cleaning Data") 
swirl()
  • From Coursera course : Getting and Cleaning Data.
  • Requirement
    • It requires R 3.0.2 version or later. If you’re not sure what version of R you have, type R.version.string at the R prompt.
    • If you are on a Linux operating system, please visit the Installing swirl on Linux page for special instructions.

Install dplyr package and load packages

if(!require(dplyr)){
    install.packages("dplyr")
    library(dplyr)
}
library(tidyr)
library(ggplot2)
library(scales)

## set criteria applying to each code chunk.    
knitr::opts_chunk$set(comment = NA, message = F)

Data set : iris in R

# Load data
data("iris")
# Converts data to tbl class. tbl's are easier to examine than data frames. R displays only the data that fits onscreen.
tbl_df(iris)
Source: local data frame [150 x 5]

   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1           5.1         3.5          1.4         0.2  setosa
2           4.9         3.0          1.4         0.2  setosa
3           4.7         3.2          1.3         0.2  setosa
4           4.6         3.1          1.5         0.2  setosa
5           5.0         3.6          1.4         0.2  setosa
6           5.4         3.9          1.7         0.4  setosa
7           4.6         3.4          1.4         0.3  setosa
8           5.0         3.4          1.5         0.2  setosa
9           4.4         2.9          1.4         0.2  setosa
10          4.9         3.1          1.5         0.1  setosa
..          ...         ...          ...         ...     ...
str(iris)
'data.frame':   150 obs. of  5 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
  • Check data structure. 5 variables and 150 observations.
  • No missing values in iris data set.

Deal with the variable name.

names <- gsub("\\.","\\_", names(iris))
names(iris) <- names 
head(iris)
  Sepal_Length Sepal_Width Petal_Length Petal_Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
table(iris$Species)

    setosa versicolor  virginica 
        50         50         50 

Examples

  • Basic verbs of data manipulating:
    • select, group_by, filter, rename, mutate, summarise, arrange.
  • Use %>% to chain codes together and make them more readable.

select columns by names.

#select three variables    
df1 <- iris %>% 
  select(Petal_Length, Petal_Width, Species)

head(df1)
  Petal_Length Petal_Width Species
1          1.4         0.2  setosa
2          1.4         0.2  setosa
3          1.3         0.2  setosa
4          1.5         0.2  setosa
5          1.4         0.2  setosa
6          1.7         0.4  setosa
# remover a variable    
df2 <- iris %>% 
  select(-Species)

head(df2)
  Sepal_Length Sepal_Width Petal_Length Petal_Width
1          5.1         3.5          1.4         0.2
2          4.9         3.0          1.4         0.2
3          4.7         3.2          1.3         0.2
4          4.6         3.1          1.5         0.2
5          5.0         3.6          1.4         0.2
6          5.4         3.9          1.7         0.4
#select the second to fourth variables by using a colon so you do not need to list each variable.            
df3 <- iris %>% 
  select(Sepal_Width:Petal_Width)

head(df3)
  Sepal_Width Petal_Length Petal_Width
1         3.5          1.4         0.2
2         3.0          1.4         0.2
3         3.2          1.3         0.2
4         3.1          1.5         0.2
5         3.6          1.4         0.2
6         3.9          1.7         0.4

group_by : group data into rows with the same value of Species.

summarise : summarise data into single row of values, such as mean or sum.

df4 <- iris %>%
  group_by(Species) %>%
  summarise(mean_Sepal_Length = mean(Sepal_Length),
            mean_Sepal_Width = mean(Sepal_Width),
            mean_Petal_Length = mean(Petal_Length),
            mean_Petal_Width = mean(Petal_Width))

df4
Source: local data frame [3 x 5]

     Species mean_Sepal_Length mean_Sepal_Width mean_Petal_Length
1     setosa             5.006            3.428             1.462
2 versicolor             5.936            2.770             4.260
3  virginica             6.588            2.974             5.552
Variables not shown: mean_Petal_Width (dbl)

gather : gather columns into rows.

gather <- df4 %>% gather(Part, Numbers, -Species) %>%
  mutate(Species = as.character(Species),
         Part = as.character(Part))
ggplot(gather , aes(x = Species, y = Numbers,fill = as.factor(Part))) +
  geom_bar(position="dodge",stat="identity", color = 'black') +
  scale_fill_brewer() +
  theme_bw() +
  guides(fill = guide_legend(title = "Part")) +
  theme(legend.position = "top") +
  geom_text(aes(label = Numbers), position = position_dodge(width = 0.9), vjust = -0.5, hjust = .5 ,size = 3)


arrange : order rows by values of a column.

filter : select a subset of the rows that meet certain criteria.

df4 %>% arrange(mean_Sepal_Length)
Source: local data frame [3 x 5]

     Species mean_Sepal_Length mean_Sepal_Width mean_Petal_Length
1     setosa             5.006            3.428             1.462
2 versicolor             5.936            2.770             4.260
3  virginica             6.588            2.974             5.552
Variables not shown: mean_Petal_Width (dbl)
## Order rows by values high to low.
df4 %>% arrange(desc(mean_Sepal_Length))
Source: local data frame [3 x 5]

     Species mean_Sepal_Length mean_Sepal_Width mean_Petal_Length
1  virginica             6.588            2.974             5.552
2 versicolor             5.936            2.770             4.260
3     setosa             5.006            3.428             1.462
Variables not shown: mean_Petal_Width (dbl)
## a subset of rows that contain two species. 
df4 %>% filter(Species == "virginica" | Species == "setosa")
Source: local data frame [2 x 5]

    Species mean_Sepal_Length mean_Sepal_Width mean_Petal_Length
1    setosa             5.006            3.428             1.462
2 virginica             6.588            2.974             5.552
Variables not shown: mean_Petal_Width (dbl)
## rename a column name from Speices to New Speices.    
df5 <- df4 %>%
  filter(Species == "virginica" | Species == "setosa") %>%
  arrange(desc(mean_Sepal_Length)) %>%
  rename(New_Species = Species)

df5
Source: local data frame [2 x 5]

  New_Species mean_Sepal_Length mean_Sepal_Width mean_Petal_Length
1   virginica             6.588            2.974             5.552
2      setosa             5.006            3.428             1.462
Variables not shown: mean_Petal_Width (dbl)

mutate : add new columns that are functions of existing columns.

head(iris)
  Sepal_Length Sepal_Width Petal_Length Petal_Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
## create a new column that fill in the mean of Sepal Length for each species.        
Sepal_Length <- iris %>%
  group_by(Species) %>%
  mutate(Average_Sepal_Length = mean(Sepal_Length),
            Above_average = Sepal_Length > Average_Sepal_Length) %>%
  select(Species, Sepal_Length, Average_Sepal_Length, Above_average)

head(Sepal_Length)
Source: local data frame [6 x 4]
Groups: Species

  Species Sepal_Length Average_Sepal_Length Above_average
1  setosa          5.1                5.006          TRUE
2  setosa          4.9                5.006         FALSE
3  setosa          4.7                5.006         FALSE
4  setosa          4.6                5.006         FALSE
5  setosa          5.0                5.006         FALSE
6  setosa          5.4                5.006          TRUE
## see  how many species is above average Sepal Length for each group.    
Sepal_Length %>%
  group_by(Species, Above_average) %>%
  summarise(length(Species))
Source: local data frame [6 x 3]
Groups: Species

     Species Above_average length(Species)
1     setosa         FALSE              28
2     setosa          TRUE              22
3 versicolor         FALSE              26
4 versicolor          TRUE              24
5  virginica         FALSE              28
6  virginica          TRUE              22