* package that contains many functions to manipulate unstructured data.
* Developed by Hadley Wickham
* Absolutely Not.
* Base R functionality can be used.
* dplyr functions are computationally more efficient and syntactically stable -- very useful while working with dataframes
* select() : for selecting columns
* slice() : for selecting rows
* filter() : for selecting subsets of rows
* mutute() : for creating new columnsrows
* arrange(): for arranging rows
* summarize
using
group_by : For summarizing the data
#Remove the environment variable
rm(list=ls())
#Load Packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Load Compensation Data File
library(readr)
compensation = read_csv("D:\\R Course\\DataAnalysis\\Data\\datasets-master\\compensation.csv")
## Rows: 40 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Grazing
## dbl (2): Root, Fruit
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View the data loaded
View(compensation)
#Dimension of the Data File
dim(compensation)
## [1] 40 3
#Display first 6 rows of the data frame
head(compensation)
## # A tibble: 6 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 6.22 59.8 Ungrazed
## 2 6.49 61.0 Ungrazed
## 3 4.92 14.7 Ungrazed
## 4 5.13 19.3 Ungrazed
## 5 5.42 34.2 Ungrazed
## 6 5.36 35.5 Ungrazed
#Display first 6 rows of the data frame
tail(compensation)
## # A tibble: 6 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 9.35 98.5 Grazed
## 2 7.07 40.2 Grazed
## 3 8.16 52.3 Grazed
## 4 7.38 46.6 Grazed
## 5 8.52 71.0 Grazed
## 6 8.53 83.0 Grazed
#Column names of the data file
names(compensation)
## [1] "Root" "Fruit" "Grazing"
Displays the variables/columns of the dataset
Used for sub-setting data by selecting variables
Syntax of select()
Takes dataframe as input and produce datframe as output
Different options:
Using pipe operator (%>%) multiple operations could be done sequentially from left to right. The pipe operator is defined in magrittr package which is installed automatically while installing dplyr package.
#Example-1: Select variables by name
select(compensation,Fruit,Grazing)
## # A tibble: 40 × 2
## Fruit Grazing
## <dbl> <chr>
## 1 59.8 Ungrazed
## 2 61.0 Ungrazed
## 3 14.7 Ungrazed
## 4 19.3 Ungrazed
## 5 34.2 Ungrazed
## 6 35.5 Ungrazed
## 7 87.7 Ungrazed
## 8 63.2 Ungrazed
## 9 24.2 Ungrazed
## 10 64.3 Ungrazed
## # ℹ 30 more rows
names(compensation)
## [1] "Root" "Fruit" "Grazing"
select(compensation,Fruit,Grazing)
## # A tibble: 40 × 2
## Fruit Grazing
## <dbl> <chr>
## 1 59.8 Ungrazed
## 2 61.0 Ungrazed
## 3 14.7 Ungrazed
## 4 19.3 Ungrazed
## 5 34.2 Ungrazed
## 6 35.5 Ungrazed
## 7 87.7 Ungrazed
## 8 63.2 Ungrazed
## 9 24.2 Ungrazed
## 10 64.3 Ungrazed
## # ℹ 30 more rows
#Using pipe operator
compensation%>% select(Fruit,Grazing)
## # A tibble: 40 × 2
## Fruit Grazing
## <dbl> <chr>
## 1 59.8 Ungrazed
## 2 61.0 Ungrazed
## 3 14.7 Ungrazed
## 4 19.3 Ungrazed
## 5 34.2 Ungrazed
## 6 35.5 Ungrazed
## 7 87.7 Ungrazed
## 8 63.2 Ungrazed
## 9 24.2 Ungrazed
## 10 64.3 Ungrazed
## # ℹ 30 more rows
#Example-2: Select variables by index position
select(compensation,c(1,3))
## # A tibble: 40 × 2
## Root Grazing
## <dbl> <chr>
## 1 6.22 Ungrazed
## 2 6.49 Ungrazed
## 3 4.92 Ungrazed
## 4 5.13 Ungrazed
## 5 5.42 Ungrazed
## 6 5.36 Ungrazed
## 7 7.61 Ungrazed
## 8 6.35 Ungrazed
## 9 4.97 Ungrazed
## 10 6.93 Ungrazed
## # ℹ 30 more rows
#Using pipe operator
compensation%>%select(1, 3)
## # A tibble: 40 × 2
## Root Grazing
## <dbl> <chr>
## 1 6.22 Ungrazed
## 2 6.49 Ungrazed
## 3 4.92 Ungrazed
## 4 5.13 Ungrazed
## 5 5.42 Ungrazed
## 6 5.36 Ungrazed
## 7 7.61 Ungrazed
## 8 6.35 Ungrazed
## 9 4.97 Ungrazed
## 10 6.93 Ungrazed
## # ℹ 30 more rows
#selecting by index range
compensation%>%select(1:3)
## # A tibble: 40 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 6.22 59.8 Ungrazed
## 2 6.49 61.0 Ungrazed
## 3 4.92 14.7 Ungrazed
## 4 5.13 19.3 Ungrazed
## 5 5.42 34.2 Ungrazed
## 6 5.36 35.5 Ungrazed
## 7 7.61 87.7 Ungrazed
## 8 6.35 63.2 Ungrazed
## 9 4.97 24.2 Ungrazed
## 10 6.93 64.3 Ungrazed
## # ℹ 30 more rows
#Example-3: Deselect/Drop the variables
compensation%>% select(!Fruit)
## # A tibble: 40 × 2
## Root Grazing
## <dbl> <chr>
## 1 6.22 Ungrazed
## 2 6.49 Ungrazed
## 3 4.92 Ungrazed
## 4 5.13 Ungrazed
## 5 5.42 Ungrazed
## 6 5.36 Ungrazed
## 7 7.61 Ungrazed
## 8 6.35 Ungrazed
## 9 4.97 Ungrazed
## 10 6.93 Ungrazed
## # ℹ 30 more rows
compensation%>% select(!3)
## # A tibble: 40 × 2
## Root Fruit
## <dbl> <dbl>
## 1 6.22 59.8
## 2 6.49 61.0
## 3 4.92 14.7
## 4 5.13 19.3
## 5 5.42 34.2
## 6 5.36 35.5
## 7 7.61 87.7
## 8 6.35 63.2
## 9 4.97 24.2
## 10 6.93 64.3
## # ℹ 30 more rows
#Example 4: Select variables based on some pattern
compensation%>% select(starts_with("Fr"))
## # A tibble: 40 × 1
## Fruit
## <dbl>
## 1 59.8
## 2 61.0
## 3 14.7
## 4 19.3
## 5 34.2
## 6 35.5
## 7 87.7
## 8 63.2
## 9 24.2
## 10 64.3
## # ℹ 30 more rows
compensation%>% select(ends_with("t"))
## # A tibble: 40 × 2
## Root Fruit
## <dbl> <dbl>
## 1 6.22 59.8
## 2 6.49 61.0
## 3 4.92 14.7
## 4 5.13 19.3
## 5 5.42 34.2
## 6 5.36 35.5
## 7 7.61 87.7
## 8 6.35 63.2
## 9 4.97 24.2
## 10 6.93 64.3
## # ℹ 30 more rows
compensation%>% select(contains("a"))
## # A tibble: 40 × 1
## Grazing
## <chr>
## 1 Ungrazed
## 2 Ungrazed
## 3 Ungrazed
## 4 Ungrazed
## 5 Ungrazed
## 6 Ungrazed
## 7 Ungrazed
## 8 Ungrazed
## 9 Ungrazed
## 10 Ungrazed
## # ℹ 30 more rows
compensation%>% select(starts_with("Fr") & ends_with("t"))
## # A tibble: 40 × 1
## Fruit
## <dbl>
## 1 59.8
## 2 61.0
## 3 14.7
## 4 19.3
## 5 34.2
## 6 35.5
## 7 87.7
## 8 63.2
## 9 24.2
## 10 64.3
## # ℹ 30 more rows
compensation%>% select(starts_with("Fr") | ends_with("t"))
## # A tibble: 40 × 2
## Fruit Root
## <dbl> <dbl>
## 1 59.8 6.22
## 2 61.0 6.49
## 3 14.7 4.92
## 4 19.3 5.13
## 5 34.2 5.42
## 6 35.5 5.36
## 7 87.7 7.61
## 8 63.2 6.35
## 9 24.2 4.97
## 10 64.3 6.93
## # ℹ 30 more rows
#Example 5: Select all numeric variables
compensation%>%select_if(is.numeric)
## # A tibble: 40 × 2
## Root Fruit
## <dbl> <dbl>
## 1 6.22 59.8
## 2 6.49 61.0
## 3 4.92 14.7
## 4 5.13 19.3
## 5 5.42 34.2
## 6 5.36 35.5
## 7 7.61 87.7
## 8 6.35 63.2
## 9 4.97 24.2
## 10 6.93 64.3
## # ℹ 30 more rows
#select the first rows
compensation%>%slice_head()
## # A tibble: 1 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 6.22 59.8 Ungrazed
compensation%>%slice_head(n=5)
## # A tibble: 5 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 6.22 59.8 Ungrazed
## 2 6.49 61.0 Ungrazed
## 3 4.92 14.7 Ungrazed
## 4 5.13 19.3 Ungrazed
## 5 5.42 34.2 Ungrazed
#select the last rows
compensation%>%slice_tail()
## # A tibble: 1 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 8.53 83.0 Grazed
compensation%>%slice_tail(n=5)
## # A tibble: 5 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 7.07 40.2 Grazed
## 2 8.16 52.3 Grazed
## 3 7.38 46.6 Grazed
## 4 8.52 71.0 Grazed
## 5 8.53 83.0 Grazed
# selects rows randomly with or without replacement
compensation%>%slice_sample(n=5)
## # A tibble: 5 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 7.92 41.5 Grazed
## 2 5.93 54.9 Ungrazed
## 3 5.84 46.7 Ungrazed
## 4 8.53 83.0 Grazed
## 5 6.93 64.3 Ungrazed
compensation%>%slice_sample(n=5,replace = TRUE)
## # A tibble: 5 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 8.91 70.1 Grazed
## 2 6.01 53.6 Ungrazed
## 3 7.69 70.7 Grazed
## 4 5.13 19.3 Ungrazed
## 5 7.35 50.1 Grazed
# selects rows based on the minimum or maximum value of any variable
compensation%>%slice_min(Fruit,n=5)
## # A tibble: 5 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 4.92 14.7 Ungrazed
## 2 6.11 15.0 Grazed
## 3 4.43 18.9 Ungrazed
## 4 5.13 19.3 Ungrazed
## 5 4.97 24.2 Ungrazed
compensation%>%slice_max(Fruit,n=5)
## # A tibble: 5 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 10.3 116. Grazed
## 2 9.84 105. Grazed
## 3 9.35 98.5 Grazed
## 4 7.61 87.7 Ungrazed
## 5 9.04 84.4 Grazed
#Select a specific row, sequence of rows, discontinuous set of rows from the data frame
compensation%>%slice(2)
## # A tibble: 1 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 6.49 61.0 Ungrazed
compensation%>%slice(2:6)
## # A tibble: 5 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 6.49 61.0 Ungrazed
## 2 4.92 14.7 Ungrazed
## 3 5.13 19.3 Ungrazed
## 4 5.42 34.2 Ungrazed
## 5 5.36 35.5 Ungrazed
compensation%>%slice(2,6,9)
## # A tibble: 3 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 6.49 61.0 Ungrazed
## 2 5.36 35.5 Ungrazed
## 3 4.97 24.2 Ungrazed
#Dropping rows
compensation%>%slice(-(3:8))
## # A tibble: 34 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 6.22 59.8 Ungrazed
## 2 6.49 61.0 Ungrazed
## 3 4.97 24.2 Ungrazed
## 4 6.93 64.3 Ungrazed
## 5 6.25 52.9 Ungrazed
## 6 5.45 32.4 Ungrazed
## 7 6.01 53.6 Ungrazed
## 8 5.93 54.9 Ungrazed
## 9 6.26 64.8 Ungrazed
## 10 7.18 73.2 Ungrazed
## # ℹ 24 more rows
Logical Operator | Meaning | Third Header |
---|---|---|
== | Equals | finds rows where condition is met. |
!= | Doesn’t Equal | finds rows where condition is NOT met. |
<,>,<=,>= | Relational | finds rows where relation is TRUE. |
OR | ||
& | AND |
`
#Filter based on logical operator
compensation%>%filter(Fruit>80)
## # A tibble: 9 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 7.61 87.7 Ungrazed
## 2 7.00 80.6 Ungrazed
## 3 10.3 116. Grazed
## 4 9.04 84.4 Grazed
## 5 8.99 80.3 Grazed
## 6 8.98 82.4 Grazed
## 7 9.84 105. Grazed
## 8 9.35 98.5 Grazed
## 9 8.53 83.0 Grazed
compensation%>%filter(Fruit>80 | Fruit<20)
## # A tibble: 13 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 4.92 14.7 Ungrazed
## 2 5.13 19.3 Ungrazed
## 3 7.61 87.7 Ungrazed
## 4 7.00 80.6 Ungrazed
## 5 4.43 18.9 Ungrazed
## 6 10.3 116. Grazed
## 7 9.04 84.4 Grazed
## 8 6.11 15.0 Grazed
## 9 8.99 80.3 Grazed
## 10 8.98 82.4 Grazed
## 11 9.84 105. Grazed
## 12 9.35 98.5 Grazed
## 13 8.53 83.0 Grazed
compensation%>%filter(Fruit>mean(Fruit))
## # A tibble: 22 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 6.22 59.8 Ungrazed
## 2 6.49 61.0 Ungrazed
## 3 7.61 87.7 Ungrazed
## 4 6.35 63.2 Ungrazed
## 5 6.93 64.3 Ungrazed
## 6 6.26 64.8 Ungrazed
## 7 7.18 73.2 Ungrazed
## 8 7.00 80.6 Ungrazed
## 9 7.30 75.5 Ungrazed
## 10 10.3 116. Grazed
## # ℹ 12 more rows
compensation%>%filter(Fruit>80 & Grazing=="Ungrazed")
## # A tibble: 2 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 7.61 87.7 Ungrazed
## 2 7.00 80.6 Ungrazed
compensation%>%mutate(frt_rt=Fruit/Root)
## # A tibble: 40 × 4
## Root Fruit Grazing frt_rt
## <dbl> <dbl> <chr> <dbl>
## 1 6.22 59.8 Ungrazed 9.60
## 2 6.49 61.0 Ungrazed 9.40
## 3 4.92 14.7 Ungrazed 2.99
## 4 5.13 19.3 Ungrazed 3.76
## 5 5.42 34.2 Ungrazed 6.32
## 6 5.36 35.5 Ungrazed 6.63
## 7 7.61 87.7 Ungrazed 11.5
## 8 6.35 63.2 Ungrazed 9.95
## 9 4.97 24.2 Ungrazed 4.87
## 10 6.93 64.3 Ungrazed 9.28
## # ℹ 30 more rows
compensation%>%mutate(logfrt=log(Fruit))
## # A tibble: 40 × 4
## Root Fruit Grazing logfrt
## <dbl> <dbl> <chr> <dbl>
## 1 6.22 59.8 Ungrazed 4.09
## 2 6.49 61.0 Ungrazed 4.11
## 3 4.92 14.7 Ungrazed 2.69
## 4 5.13 19.3 Ungrazed 2.96
## 5 5.42 34.2 Ungrazed 3.53
## 6 5.36 35.5 Ungrazed 3.57
## 7 7.61 87.7 Ungrazed 4.47
## 8 6.35 63.2 Ungrazed 4.15
## 9 4.97 24.2 Ungrazed 3.19
## 10 6.93 64.3 Ungrazed 4.16
## # ℹ 30 more rows
#By default sorting is in ascending order
compensation%>%arrange(Fruit)
## # A tibble: 40 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 4.92 14.7 Ungrazed
## 2 6.11 15.0 Grazed
## 3 4.43 18.9 Ungrazed
## 4 5.13 19.3 Ungrazed
## 5 4.97 24.2 Ungrazed
## 6 5.45 32.4 Ungrazed
## 7 5.42 34.2 Ungrazed
## 8 5.36 35.5 Ungrazed
## 9 6.96 38.9 Grazed
## 10 7.07 40.2 Grazed
## # ℹ 30 more rows
#The order can be changed
compensation%>%arrange(desc(Fruit))
## # A tibble: 40 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 10.3 116. Grazed
## 2 9.84 105. Grazed
## 3 9.35 98.5 Grazed
## 4 7.61 87.7 Ungrazed
## 5 9.04 84.4 Grazed
## 6 8.53 83.0 Grazed
## 7 8.98 82.4 Grazed
## 8 7.00 80.6 Ungrazed
## 9 8.99 80.3 Grazed
## 10 8.64 78.3 Grazed
## # ℹ 30 more rows
#Sorting based on multiple variables
compensation%>%arrange(Fruit,desc(Root))
## # A tibble: 40 × 3
## Root Fruit Grazing
## <dbl> <dbl> <chr>
## 1 4.92 14.7 Ungrazed
## 2 6.11 15.0 Grazed
## 3 4.43 18.9 Ungrazed
## 4 5.13 19.3 Ungrazed
## 5 4.97 24.2 Ungrazed
## 6 5.45 32.4 Ungrazed
## 7 5.42 34.2 Ungrazed
## 8 5.36 35.5 Ungrazed
## 9 6.96 38.9 Grazed
## 10 7.07 40.2 Grazed
## # ℹ 30 more rows
Summarization is accomplished in a series of steps. They are:
compensation%>%group_by(Grazing)%>%summarize(mean_fruit=mean(Fruit))
## # A tibble: 2 × 2
## Grazing mean_fruit
## <chr> <dbl>
## 1 Grazed 67.9
## 2 Ungrazed 50.9
compensation%>%group_by(Grazing)%>%summarize(mean_fruit=mean(Fruit),sd_fruit=sd(Fruit))
## # A tibble: 2 × 3
## Grazing mean_fruit sd_fruit
## <chr> <dbl> <dbl>
## 1 Grazed 67.9 25.0
## 2 Ungrazed 50.9 21.8
Operation | dplyr Function | Base R Command |
---|---|---|
select some rows | comepensation>%>slice(2,6,8) | compensation[c(2,6,8),] |
select some columns | comepensation>%>select(Root,Fruit) | compensation[,c(“Root”,“Fruit”)] |
subset | comepensation>%>filter(Fruit>80) | compensation[compensation$Fruit>80,] |
Ordering | comepensation>%>arrange(Fruit) | compensation[order(compensation$Fruit),] |
Add a column | compensation>%>mutate(frt_rt=Fruit/Root) | compensation$frt_rt=compensation$Fruit/compensation$Root |
summarize data | compensation>%>group_by(Grazing)>%>%summarize(mean_fruit=mean(Fruit)) | tapply(compensation$Fruit,list(compensation$Grazing),mean) |