# Tutorial4

library(datasets)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Preview the first six rows of the built-in airquality data frame.
head(airquality, n = 6L)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

# Compact overview of the data: one line per variable, with its type and the
# first few values.
glimpse(x = airquality)

## Observations: 153
## Variables: 6
## $ Ozone   <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, ...
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256,...
## $ Wind    <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6...
## $ Temp    <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68...
## $ Month   <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
## $ Day     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...

# Convert the data frame to a tibble. as_tibble() is used instead of tbl_df(),
# which is deprecated as of dplyr 1.0.0.
airquality_tb <- as_tibble(airquality)

# Filter
# The filter function returns all the rows that satisfy a given condition.
# For example, the call below returns all the rows where Temp is larger than 70.
filter(
  airquality_tb,
  Temp > 70
)

## # A tibble: 120 x 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <int>   <int> <dbl> <int> <int> <int>
##  1    36     118   8      72     5     2
##  2    12     149  12.6    74     5     3
##  3     7      NA   6.9    74     5    11
##  4    11     320  16.6    73     5    22
##  5    45     252  14.9    81     5    29
##  6   115     223   5.7    79     5    30
##  7    37     279   7.4    76     5    31
##  8    NA     286   8.6    78     6     1
##  9    NA     287   9.7    74     6     2
## 10    NA     186   9.2    84     6     4
## # ... with 110 more rows

# Another example of filter: return all the rows where Temp is larger than 80
# and Month is after May. Conditions separated by commas are combined with &.

filter(airquality_tb, Temp > 80, Month > 5)

## # A tibble: 67 x 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <int>   <int> <dbl> <int> <int> <int>
##  1    NA     186   9.2    84     6     4
##  2    NA     220   8.6    85     6     5
##  3    29     127   9.7    82     6     7
##  4    NA     273   6.9    87     6     8
##  5    71     291  13.8    90     6     9
##  6    39     323  11.5    87     6    10
##  7    NA     259  10.9    93     6    11
##  8    NA     250   9.2    92     6    12
##  9    23     148   8      82     6    13
## 10    NA     138   8      83     6    30
## # ... with 57 more rows

# Mutate
# Mutate is used to add new variables to the data. For example, let's add a
# new column that displays the temperature in Celsius.

mutate(
  airquality_tb,
  TempInC = (Temp - 32) * 5 / 9
)

## # A tibble: 153 x 7
##    Ozone Solar.R  Wind  Temp Month   Day TempInC
##    <int>   <int> <dbl> <int> <int> <int>   <dbl>
##  1    41     190   7.4    67     5     1    19.4
##  2    36     118   8      72     5     2    22.2
##  3    12     149  12.6    74     5     3    23.3
##  4    18     313  11.5    62     5     4    16.7
##  5    NA      NA  14.3    56     5     5    13.3
##  6    28      NA  14.9    66     5     6    18.9
##  7    23     299   8.6    65     5     7    18.3
##  8    19      99  13.8    59     5     8    15  
##  9     8      19  20.1    61     5     9    16.1
## 10    NA     194   8.6    69     5    10    20.6
## # ... with 143 more rows

# Summarise
# The summarise function collapses multiple values into a single value. It is
# very powerful when used in conjunction with the other functions in the dplyr
# package, as demonstrated below. na.rm = TRUE drops NA values before the mean
# is computed; without it, a single NA would make the result NA.

summarise(
  airquality_tb,
  mean(Temp, na.rm = TRUE)
)

## # A tibble: 1 x 1
##   `mean(Temp, na.rm = TRUE)`
##                        <dbl>
## 1                       77.9

# Group By
# The group_by function is used to group data by one or more variables. Here
# the data is grouped by Month, and then the summarise function is used to
# calculate the mean temperature in each month.

summarise(
  group_by(airquality_tb, Month),
  mean(Temp, na.rm = TRUE)
)

## # A tibble: 5 x 2
##   Month `mean(Temp, na.rm = TRUE)`
##   <int>                      <dbl>
## 1     5                       65.5
## 2     6                       79.1
## 3     7                       83.9
## 4     8                       84.0
## 5     9                       76.9

# Sample
# The sample_n and sample_frac functions select random rows from a table.
# sample_n (below) randomly selects ten rows from the dataset; sample_frac
# (further down) randomly selects 15 rows (10% of the original 153 rows).

sample_n(
  airquality_tb,
  size = 10
)

## # A tibble: 10 x 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <int>   <int> <dbl> <int> <int> <int>
##  1    44     236  14.9    81     9    11
##  2    36     118   8      72     5     2
##  3    23     148   8      82     6    13
##  4    NA     259  10.9    93     6    11
##  5     7      48  14.3    80     7    15
##  6    48     260   6.9    81     7    16
##  7   122     255   4      89     8     7
##  8    20     223  11.5    68     9    30
##  9    37     279   7.4    76     5    31
## 10    NA      NA   8      57     5    27

# Randomly select 10% of the rows.
sample_frac(
  airquality_tb,
  size = 0.1
)

## # A tibble: 15 x 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <int>   <int> <dbl> <int> <int> <int>
##  1    37     284  20.7    72     6    17
##  2    59      51   6.3    79     8    17
##  3     7      49  10.3    69     9    24
##  4    10     264  14.3    73     7    12
##  5     1       8   9.7    59     5    21
##  6    12     120  11.5    73     6    19
##  7     9      24  13.8    81     8     2
##  8    23     299   8.6    65     5     7
##  9    16      77   7.4    82     8     3
## 10    96     167   6.9    91     9     1
## 11    11     320  16.6    73     5    22
## 12    23      13  12      67     5    28
## 13   135     269   4.1    84     7     1
## 14    NA      47  10.3    73     6    27
## 15    NA     150   6.3    77     6    21

# Count
# The count function tallies observations by group. It is similar to the
# table function in the base package. For example:

count(
  airquality_tb,
  Month
)

## # A tibble: 5 x 2
##   Month     n
##   <int> <int>
## 1     5    31
## 2     6    30
## 3     7    31
## 4     8    31
## 5     9    30

# Arrange
# The arrange function is used to order rows by variables. Currently, the
# airquality dataset is ordered by Month, and then by Day. Here we arrange the
# rows in descending order of Month, and then in ascending order of Day.

arrange(
  airquality_tb,
  desc(Month),
  Day
)

## # A tibble: 153 x 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <int>   <int> <dbl> <int> <int> <int>
##  1    96     167   6.9    91     9     1
##  2    78     197   5.1    92     9     2
##  3    73     183   2.8    93     9     3
##  4    91     189   4.6    93     9     4
##  5    47      95   7.4    87     9     5
##  6    32      92  15.5    84     9     6
##  7    20     252  10.9    80     9     7
##  8    23     220  10.3    78     9     8
##  9    21     230  10.9    75     9     9
## 10    24     259   9.7    73     9    10
## # ... with 143 more rows

# Pipe
# The pipe operator in R, represented by %>% can be used to chain code together. It is very useful when you are performing several operations on data, and don’t want to save the output at each intermediate step.

# For example, let’s say we want to remove all the data corresponding to Month = 5, group the data by month, and then find the mean of the temperature each month. The conventional way to write the code for this would be:
  
# Step 1: keep every row except those from May (Month 5).
filteredData <- filter(airquality_tb, Month != 5)
# Step 2: group the remaining rows by Month.
groupedData <- group_by(filteredData, Month)
# Step 3: compute the mean temperature within each month group.
summarise(groupedData, mean(Temp, na.rm = TRUE))

## # A tibble: 4 x 2
##   Month `mean(Temp, na.rm = TRUE)`
##   <int>                      <dbl>
## 1     6                       79.1
## 2     7                       83.9
## 3     8                       84.0
## 4     9                       76.9

# With piping, the above code can be rewritten as a single chain starting from
# the same tibble, with no intermediate variables:

airquality_tb %>%
  filter(Month != 5) %>%
  group_by(Month) %>%
  summarise(mean(Temp, na.rm = TRUE))

## # A tibble: 4 x 2
##   Month `mean(Temp, na.rm = TRUE)`
##   <int>                      <dbl>
## 1     6                       79.1
## 2     7                       83.9
## 3     8                       84.0
## 4     9                       76.9

# Tutorial4
#
# Woongbae
#
# September 27, 2018