Run code in RStudio: command + enter
Jump between apps: command + tab
Selecting more than one file: click first file, hold shift, click last file
Delete a file: command + delete
# load packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(nycflights13)
The pipe operator (%>%) allows us to combine multiple operations in R into a single sequential chain of actions (ModernDive).
Keyboard shortcut: command + shift + m
# Build a new data frame from flights that keeps only the rows whose
# carrier code is "AS".
alaska_flights <- flights %>%
  filter(carrier == "AS")
filter() allows you to specify criteria about the values of a variable in your data-set and then keeps only the rows that match those criteria (ModernDive).
# Build a new data frame that keeps only the flights whose destination
# is PDX.
portland_flights <- flights %>%
  filter(dest == "PDX")
# View(portland_flights)
# Filter example with more complicated operators: & (and), | (or), >=.
# Keep the rows that departed from JFK, were headed to Burlington, VT (BTV)
# or Seattle, WA (SEA), and flew in month 10 or later (the "fall" in the name).
btv_sea_flights_fall <- flights %>%
  filter(
    origin == "JFK" &
      (dest == "BTV" | dest == "SEA") &
      month >= 10
  )
# View(btv_sea_flights_fall)
# Same example as above, but with commas: conditions separated by commas
# inside filter() are combined with "and", just like &.
btv_sea_flights_fall <- flights %>%
  filter(
    origin == "JFK",
    (dest == "BTV" | dest == "SEA"),
    month >= 10
  )
# View(btv_sea_flights_fall)
# Filter example with the not-equal operator (!=): flights that didn't go to
# Seattle, WA, or Burlington, VT. "Not BTV and not SEA" is the same condition
# as negating the "or" with !, i.e. !(dest == "BTV" | dest == "SEA").
not_BTV_SEA <- flights %>%
  filter(dest != "BTV" & dest != "SEA")
# View(not_BTV_SEA)
# Filter example with a larger number of airports.
# OPTION 1: chain one comparison per airport with the | (or) operator.
many_airports <- flights %>%
  filter(
    dest == "SEA" |
      dest == "SFO" |
      dest == "PDX" |
      dest == "BTV" |
      dest == "BDL"
  )
# OPTION 2: same rows as above, using the %in% operator with a c() vector of
# airport codes. Much less typing, and easier to extend.
many_airports <- flights %>%
  filter(
    dest %in% c("SEA", "SFO", "PDX", "BTV", "BDL")
  )
# View(many_airports)
summarize() takes many values and returns one summary value, using functions such as mean(), median(), min(), max(), sd(), etc.
# Mean and standard deviation of temp over the whole weather data frame.
# Missing values (NA) have to be ignored/removed first, so the na.rm
# ("NA remove") argument is set to TRUE.
# REMEMBER: There are ramifications to doing this (i.e., sweeping missing
# values under the rug).
summary_temp <- weather %>%
  summarize(
    mean = mean(temp, na.rm = TRUE),
    std_dev = sd(temp, na.rm = TRUE)
  )
summary_temp
## # A tibble: 1 x 2
## mean std_dev
## <dbl> <dbl>
## 1 55.3 17.8
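NOTES: - group_by() does not change the data in the data-frame by itself; it only changes the metadata (data about the data) by recording how the rows are grouped. - When combined with summarize(), group_by() produces a new data-frame with one row per group; the original data-frame is still not modified unless you assign the result back to it.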
# Mean and standard deviation of temp, computed separately for each month:
# group_by(month) splits the rows by month, summarize() then returns one row
# per month.
summary_monthly_temp <- weather %>%
  group_by(month) %>%
  summarize(
    mean = mean(temp, na.rm = TRUE),
    std_dev = sd(temp, na.rm = TRUE)
  )
summary_monthly_temp
## # A tibble: 12 x 3
## month mean std_dev
## * <int> <dbl> <dbl>
## 1 1 35.6 10.2
## 2 2 34.3 6.98
## 3 3 39.9 6.25
## 4 4 51.7 8.79
## 5 5 61.8 9.68
## 6 6 72.2 7.55
## 7 7 80.1 7.12
## 8 8 74.5 5.19
## 9 9 67.4 8.47
## 10 10 60.1 8.85
## 11 11 45.0 10.4
## 12 12 38.4 9.98
# Example of metadata: printing the diamonds tibble shows a header line with
# its dimensions and, under each column name, that column's type.
diamonds
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # … with 53,930 more rows
# group_by() only adds grouping metadata: the printed header gains a
# "Groups: cut [5]" line (one group for each of the 5 types of cut), but the
# rows and columns of the data HAVE NOT been changed.
diamonds %>%
  group_by(cut)
## # A tibble: 53,940 x 10
## # Groups: cut [5]
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # … with 53,930 more rows
# Group by cut, then compute the average price within each cut.
# The result HAS a different shape: a new tibble with one row per cut.
# diamonds itself is untouched, because the result is not assigned back to it.
diamonds %>%
  group_by(cut) %>%
  summarize(
    avg_price = mean(price)
  )
## # A tibble: 5 x 2
## cut avg_price
## * <ord> <dbl>
## 1 Fair 4359.
## 2 Good 3929.
## 3 Very Good 3982.
## 4 Premium 4584.
## 5 Ideal 3458.
# ungroup() removes the grouping structure from the metadata again, so the
# printed header no longer shows a "Groups:" line.
diamonds %>%
  group_by(cut) %>%
  ungroup()
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # … with 53,930 more rows
# n() counts the rows in each group; sum(), by contrast, returns the sum of a
# numerical variable. Here: the number of flights per origin airport.
by_origin <- flights %>%
  group_by(origin) %>%
  summarize(count = n())
by_origin
## # A tibble: 3 x 2
## origin count
## * <chr> <int>
## 1 EWR 120835
## 2 JFK 111279
## 3 LGA 104662
# Grouping by more than one variable.
# To group by more than one variable, pass all of them to a single group_by()
# call, separated by commas. Separate group_by() calls do not add up: by
# default each new call overrides the grouping set by the previous one.
#
# summarize() removes the last grouping level (month), so the result is still
# grouped by origin. .groups = "drop_last" is what dplyr does by default here;
# spelling it out documents that the result stays grouped and stops dplyr from
# printing its "`summarise()` has grouped output by 'origin'" message.
by_origin_monthly <- flights %>%
  group_by(origin, month) %>%
  summarize(count = n(), .groups = "drop_last")
by_origin_monthly
## # A tibble: 36 x 3
## # Groups: origin [3]
## origin month count
## <chr> <int> <int>
## 1 EWR 1 9893
## 2 EWR 2 9107
## 3 EWR 3 10420
## 4 EWR 4 10531
## 5 EWR 5 10592
## 6 EWR 6 10175
## 7 EWR 7 10475
## 8 EWR 8 10359
## 9 EWR 9 9550
## 10 EWR 10 10104
## # … with 26 more rows