Classwork_Data Wrangling

General Coding Keyboard Shortcut Tips

Run code in RStudio: command + enter

Jump between apps: command + tab

Selecting more than one file: click first file, hold shift, click last file

Delete a file: command + delete

Chaining operators: + for ggplot2 (adds layers to a plot), %>% for dplyr (pipes data into the next step)

# load packages
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(nycflights13)

The pipe operator ( %>% )

Allows us to combine multiple operations in R into a single sequential chain of actions (ModernDive).

Keyboard Shortcut: command + shift + m

# Creates a new data frame from the flights data frame, keeping only the
# rows whose carrier code is "AS" (the Alaska flights, per the variable name).
# The pipe passes flights in as the first argument of filter().
alaska_flights <- flights %>% 
  filter(carrier == "AS")

Filter

Allows you to specify criteria about the values of a variable in your data-set and then keeps only the rows that match those criteria; all other rows are dropped (ModernDive).

# Creates a new data frame by keeping only the flights whose destination
# (dest) is "PDX". Note that == tests for equality; a single = would not.
portland_flights <- flights %>% 
  filter(dest == "PDX")
 #View(portland_flights)

# Filter example with more complicated operators. Keeps rows that departed
# from JFK, were headed to Burlington, VT (BTV) or Seattle, WA (SEA), and
# left in October or later (month >= 10).
# & means "and", | means "or". The parentheses are needed because & is
# evaluated before |, so without them the "or" would not be grouped correctly.
btv_sea_flights_fall <- flights %>% 
  filter(origin == "JFK" & (dest == "BTV" | dest == "SEA") & month >= 10)
# View(btv_sea_flights_fall)

# Same example as above, but with commas: filter() treats comma-separated
# conditions as "and", so each comma here plays the role of & above.
# This overwrites btv_sea_flights_fall with an identical result.
btv_sea_flights_fall <- flights %>% 
  filter(origin == "JFK", (dest == "BTV" | dest == "SEA"), month >= 10)
# View(btv_sea_flights_fall)

# Filter example with the not operator (!). Keeps flights that did NOT go to
# Burlington, VT, or Seattle, WA, by negating the whole "or" condition.
# Equivalent to writing: dest != "BTV" & dest != "SEA".
not_BTV_SEA <- flights %>% 
  filter(!(dest == "BTV" | dest == "SEA"))
# View(not_BTV_SEA)

# Filter example with a larger number of airports, OPTION 1: chain one
# dest == "..." test per airport together with the | (or) operator.
# A row is kept if its destination matches any one of the five codes.
many_airports <- flights %>% 
  filter(dest == "SEA" | dest == "SFO" | dest == "PDX" | 
         dest == "BTV" | dest == "BDL")

# Same as above, OPTION 2: use the %in% operator with a vector built by c().
# A row is kept if its dest appears anywhere in the vector. Shorter to write
# and easier to extend than the chain of | conditions in OPTION 1.
many_airports <- flights %>% 
  filter(dest %in% c("SEA", "SFO", "PDX", "BTV", "BDL"))
# View(many_airports)

summarize() function

Used with summary functions, which take many values and return one → mean(), median(), min(), max(), sd(), etc.

# Computes the mean and standard deviation of temp over the whole weather
# data frame, returned as a one-row tibble with columns mean and std_dev.
# Missing values (NA) must be ignored/removed, so set the na.rm ("NA remove")
# argument to TRUE; otherwise mean() and sd() return NA when any value is missing.
# REMEMBER: There are ramifications to doing this (i.e., sweeping missing values under the rug).
summary_temp <- weather %>% 
  summarize(mean = mean(temp, na.rm = TRUE), 
            std_dev = sd(temp, na.rm = TRUE))
summary_temp

## # A tibble: 1 x 2
##    mean std_dev
##   <dbl>   <dbl>
## 1  55.3    17.8

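NOTES:
- group_by() does not change the data in a data-frame by itself; it only changes the metadata (data about the data) by recording the grouping structure.
- When followed by summarize(), the grouping takes effect: the result is a new data-frame with one row per group. The original data-frame is still not modified unless you assign the result back to it.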

# Computes the mean and standard deviation of temperature separately for
# each month. group_by(month) defines the groups; summarize() then returns
# one row per month instead of one row for the whole data frame.
# na.rm = TRUE drops missing temperatures before each calculation.
summary_monthly_temp <- weather %>% 
  group_by(month) %>% 
  summarize(mean = mean(temp, na.rm = TRUE), 
            std_dev = sd(temp, na.rm = TRUE))
summary_monthly_temp

## # A tibble: 12 x 3
##    month  mean std_dev
##  * <int> <dbl>   <dbl>
##  1     1  35.6   10.2 
##  2     2  34.3    6.98
##  3     3  39.9    6.25
##  4     4  51.7    8.79
##  5     5  61.8    9.68
##  6     6  72.2    7.55
##  7     7  80.1    7.12
##  8     8  74.5    5.19
##  9     9  67.4    8.47
## 10    10  60.1    8.85
## 11    11  45.0   10.4 
## 12    12  38.4    9.98

# Example of metadata. Printing the diamonds tibble shows its metadata
# (dimensions and the type of each column) above the first rows of data.
diamonds

## # A tibble: 53,940 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # … with 53,930 more rows

# group_by(cut) only adds grouping metadata: the printout gains a
# "Groups: cut [5]" line (one group per type of cut). But: the rows and
# values of the data HAVE NOT been changed.
diamonds %>% 
  group_by(cut)

## # A tibble: 53,940 x 10
## # Groups:   cut [5]
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # … with 53,930 more rows

# Grouped by cut, then summarized: computes the average price for each cut.
# The result HAS a different shape (a new tibble with one row per cut and
# the columns cut and avg_price), but diamonds itself is not modified
# because the result is not assigned back to it.
diamonds %>% 
  group_by(cut) %>% 
  summarize(avg_price = mean(price))

## # A tibble: 5 x 2
##   cut       avg_price
## * <ord>         <dbl>
## 1 Fair          4359.
## 2 Good          3929.
## 3 Very Good     3982.
## 4 Premium       4584.
## 5 Ideal         3458.

# Use ungroup() to remove the grouping structure from the metadata.
# The printed result no longer has a "Groups:" line and matches the
# original diamonds printout.
diamonds %>% 
  group_by(cut) %>% 
  ungroup()

## # A tibble: 53,940 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # … with 53,930 more rows

# n() counts rows; sum() returns the sum of a numerical variable.
# Here n() is used inside summarize() after group_by(origin), so count is
# the number of flights (rows) for each origin airport.
by_origin <- flights %>% 
  group_by(origin) %>% 
  summarize(count = n())
by_origin

## # A tibble: 3 x 2
##   origin  count
## * <chr>   <int>
## 1 EWR    120835
## 2 JFK    111279
## 3 LGA    104662

# Grouping by more than one variable.
# To group by more than one variable, list all of them in a single group_by()
# call, separated by commas. A second, separate group_by() call would replace
# the first grouping instead of adding to it.
# summarize() removes only the last grouping variable (month), so the result
# is still grouped by origin; that is what the message printed below reports.
by_origin_monthly <- flights %>% 
  group_by(origin, month) %>% 
  summarize(count = n())

## `summarise()` has grouped output by 'origin'. You can override using the `.groups` argument.

by_origin_monthly

## # A tibble: 36 x 3
## # Groups:   origin [3]
##    origin month count
##    <chr>  <int> <int>
##  1 EWR        1  9893
##  2 EWR        2  9107
##  3 EWR        3 10420
##  4 EWR        4 10531
##  5 EWR        5 10592
##  6 EWR        6 10175
##  7 EWR        7 10475
##  8 EWR        8 10359
##  9 EWR        9  9550
## 10 EWR       10 10104
## # … with 26 more rows

Classwork_Data Wrangling

Molly Fischer

10/7/2021

General Coding Keyboard Shortcut Tips

The pipe operator ( %>% )

Filter

summarize() function