library(datasets)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Preview the first six rows of the built-in airquality data frame.
head(airquality, n = 6L)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Compact overview of the data: one line per column showing its type and first values.
glimpse(airquality)
## Observations: 153
## Variables: 6
## $ Ozone <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, ...
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256,...
## $ Wind <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6...
## $ Temp <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68...
## $ Month <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
## $ Day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
# Tibble
# Convert the base data frame to a tibble so that only the first rows are printed, together with the column types.
# as_tibble() is used instead of tbl_df(), which is deprecated since dplyr 1.0.0.
airquality_tb <- as_tibble(airquality)
## Filter ##
# filter() returns all the rows that satisfy the given condition. For example, the call below returns all the rows where Temp is greater than 70.
filter(airquality_tb, Temp > 70)
## # A tibble: 120 x 6
## Ozone Solar.R Wind Temp Month Day
## <int> <int> <dbl> <int> <int> <int>
## 1 36 118 8 72 5 2
## 2 12 149 12.6 74 5 3
## 3 7 NA 6.9 74 5 11
## 4 11 320 16.6 73 5 22
## 5 45 252 14.9 81 5 29
## 6 115 223 5.7 79 5 30
## 7 37 279 7.4 76 5 31
## 8 NA 286 8.6 78 6 1
## 9 NA 287 9.7 74 6 2
## 10 NA 186 9.2 84 6 4
## # ... with 110 more rows
# Several conditions can be given at once: comma-separated conditions must all hold, exactly as if they were joined with &. This call returns all the rows where Temp is greater than 80 and Month is after May.
filter(airquality_tb, Temp > 80, Month > 5)
## # A tibble: 67 x 6
## Ozone Solar.R Wind Temp Month Day
## <int> <int> <dbl> <int> <int> <int>
## 1 NA 186 9.2 84 6 4
## 2 NA 220 8.6 85 6 5
## 3 29 127 9.7 82 6 7
## 4 NA 273 6.9 87 6 8
## 5 71 291 13.8 90 6 9
## 6 39 323 11.5 87 6 10
## 7 NA 259 10.9 93 6 11
## 8 NA 250 9.2 92 6 12
## 9 23 148 8 82 6 13
## 10 NA 138 8 83 6 30
## # ... with 57 more rows
# Mutate
# mutate() adds new variables to the data. For example, let's add a new column, TempInC, that displays the temperature in Celsius, computed from Temp with the Fahrenheit-to-Celsius formula (F - 32) * 5 / 9.
mutate(airquality_tb, TempInC = (Temp - 32) * 5 / 9)
## # A tibble: 153 x 7
## Ozone Solar.R Wind Temp Month Day TempInC
## <int> <int> <dbl> <int> <int> <int> <dbl>
## 1 41 190 7.4 67 5 1 19.4
## 2 36 118 8 72 5 2 22.2
## 3 12 149 12.6 74 5 3 23.3
## 4 18 313 11.5 62 5 4 16.7
## 5 NA NA 14.3 56 5 5 13.3
## 6 28 NA 14.9 66 5 6 18.9
## 7 23 299 8.6 65 5 7 18.3
## 8 19 99 13.8 59 5 8 15
## 9 8 19 20.1 61 5 9 16.1
## 10 NA 194 8.6 69 5 10 20.6
## # ... with 143 more rows
# Summarise
# summarise() collapses multiple values into a single value. It is very powerful when used in conjunction with the other functions in the dplyr package, as demonstrated below. na.rm = TRUE drops NA values before the mean is calculated; without it, a single NA in the column would make the result NA.
# Because the summary expression is not named, the expression itself becomes the name of the output column.
summarise(airquality_tb, mean(Temp, na.rm = TRUE))
## # A tibble: 1 x 1
## `mean(Temp, na.rm = TRUE)`
## <dbl>
## 1 77.9
# Group By
# group_by() groups the data by one or more variables. The call below groups the data by Month, and summarise() then calculates the mean temperature for each month.
summarise(group_by(airquality_tb, Month), mean(Temp, na.rm = TRUE))
## # A tibble: 5 x 2
## Month `mean(Temp, na.rm = TRUE)`
## <int> <dbl>
## 1 5 65.5
## 2 6 79.1
## 3 7 83.9
## 4 8 84.0
## 5 9 76.9
# Sample
# sample_n() and sample_frac() select random rows from a table. The sample_n() call below randomly selects ten rows from the dataset; the sample_frac() call further down randomly selects 10% of the rows (15 of the original 153). Rows are drawn at random, so the output differs between runs unless a seed is fixed with set.seed().
sample_n(airquality_tb, size = 10)
## # A tibble: 10 x 6
## Ozone Solar.R Wind Temp Month Day
## <int> <int> <dbl> <int> <int> <int>
## 1 44 236 14.9 81 9 11
## 2 36 118 8 72 5 2
## 3 23 148 8 82 6 13
## 4 NA 259 10.9 93 6 11
## 5 7 48 14.3 80 7 15
## 6 48 260 6.9 81 7 16
## 7 122 255 4 89 8 7
## 8 20 223 11.5 68 9 30
## 9 37 279 7.4 76 5 31
## 10 NA NA 8 57 5 27
# Randomly select 10% of the rows (15 of the 153 rows).
sample_frac(airquality_tb, size = 0.1)
## # A tibble: 15 x 6
## Ozone Solar.R Wind Temp Month Day
## <int> <int> <dbl> <int> <int> <int>
## 1 37 284 20.7 72 6 17
## 2 59 51 6.3 79 8 17
## 3 7 49 10.3 69 9 24
## 4 10 264 14.3 73 7 12
## 5 1 8 9.7 59 5 21
## 6 12 120 11.5 73 6 19
## 7 9 24 13.8 81 8 2
## 8 23 299 8.6 65 5 7
## 9 16 77 7.4 82 8 3
## 10 96 167 6.9 91 9 1
## 11 11 320 16.6 73 5 22
## 12 23 13 12 67 5 28
## 13 135 269 4.1 84 7 1
## 14 NA 47 10.3 73 6 27
## 15 NA 150 6.3 77 6 21
# Count
# count() tallies observations by group. It is similar to the table function in the base package. For example, the call below counts the number of rows for each Month:
count(airquality_tb, Month)
## # A tibble: 5 x 2
## Month n
## <int> <int>
## 1 5 31
## 2 6 30
## 3 7 31
## 4 8 31
## 5 9 30
# Arrange
# arrange() sorts rows by one or more variables. The airquality dataset is currently ordered by Month, and then by Day. The call below sorts the rows in descending order of Month (via desc()), and then in ascending order of Day.
arrange(airquality_tb, desc(Month), Day)
## # A tibble: 153 x 6
## Ozone Solar.R Wind Temp Month Day
## <int> <int> <dbl> <int> <int> <int>
## 1 96 167 6.9 91 9 1
## 2 78 197 5.1 92 9 2
## 3 73 183 2.8 93 9 3
## 4 91 189 4.6 93 9 4
## 5 47 95 7.4 87 9 5
## 6 32 92 15.5 84 9 6
## 7 20 252 10.9 80 9 7
## 8 23 220 10.3 78 9 8
## 9 21 230 10.9 75 9 9
## 10 24 259 9.7 73 9 10
## # ... with 143 more rows
# Pipe
# The pipe operator, written %>%, can be used to chain calls together. It is very useful when you are performing several operations on data and don't want to save the output at each intermediate step.
# For example, let's say we want to remove all the data corresponding to Month = 5, group the data by month, and then find the mean temperature for each month. The conventional way to write the code for this stores each intermediate result in its own variable:
# Step 1: drop the rows for Month 5.
filteredData <- filter(airquality_tb, Month != 5)
# Step 2: group the remaining rows by Month.
groupedData <- group_by(filteredData, Month)
# Step 3: compute the mean Temp within each group.
summarise(groupedData, mean(Temp, na.rm = TRUE))
## # A tibble: 4 x 2
## Month `mean(Temp, na.rm = TRUE)`
## <int> <dbl>
## 1 6 79.1
## 2 7 83.9
## 3 8 84.0
## 4 9 76.9
# With piping, the above code can be rewritten as a single chain with no intermediate variables.
# The chain starts from airquality_tb, the same input the step-by-step version uses, so the two are directly comparable.
airquality_tb %>%
  filter(Month != 5) %>%
  group_by(Month) %>%
  summarise(mean(Temp, na.rm = TRUE))
## # A tibble: 4 x 2
## Month `mean(Temp, na.rm = TRUE)`
## <int> <dbl>
## 1 6 79.1
## 2 7 83.9
## 3 8 84.0
## 4 9 76.9