R assignment

R Assignment

Book: https://r4ds.hadley.nz/

Load libraries

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(palmerpenguins)
library(ggthemes)

Workflow: basics

1 + 2
[1] 3
1 / 200 * 30
[1] 0.15
(59 + 73 + 2) / 3
[1] 44.66667
sin(pi / 2)
[1] 1
primes <- c(2, 3, 5, 7, 11, 13)

primes * 2
[1]  4  6 10 14 22 26
this_is_a_really_long_name <- 2.5
r_rocks <- 2^3
r_rocks
[1] 8

functions

seq(from = 1, to = 10)
 [1]  1  2  3  4  5  6  7  8  9 10
seq(1, 30)
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30
x <- "hello world"

Dataset

penguins
# A tibble: 344 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           NA            NA                  NA          NA
 5 Adelie  Torgersen           36.7          19.3               193        3450
 6 Adelie  Torgersen           39.3          20.6               190        3650
 7 Adelie  Torgersen           38.9          17.8               181        3625
 8 Adelie  Torgersen           39.2          19.6               195        4675
 9 Adelie  Torgersen           34.1          18.1               193        3475
10 Adelie  Torgersen           42            20.2               190        4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
glimpse(penguins)
Rows: 344
Columns: 8
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
$ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
$ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
$ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
$ sex               <fct> male, female, female, NA, female, male, female, male…
$ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

Create a ggplot

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
  ) +
  geom_point()
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Adding aesthetics and layers

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) +
  geom_point()
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Add smooth Curve

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) +
  geom_point() + 
  geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Global level / Local level

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point(mapping = aes(color = species)) +
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Map species to the shape aesthetic

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point(mapping = aes(color = species, shape = species)) +
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Labels

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point(aes(color = species, shape = species)) +
  geom_smooth(method = "lm") +
  labs(
    title = "Body mass and flipper length",
    subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
    x = "Flipper length (mm)", y = "Body mass (g)",
    color = "Species", shape = "Species"
  ) +
  scale_color_colorblind()
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Exercises

dim(penguins)
[1] 344   8
str(penguins)
tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
 $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
 $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
 $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
 $ body_mass_g      : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
 $ sex              : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
 $ year             : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = bill_depth_mm)
  ) +
  geom_point() +
  geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

species vs. bill_depth_mm

ggplot(
  data = penguins,
  mapping = aes(x = species, y = bill_depth_mm)
  ) +
  geom_boxplot() + geom_jitter() 
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(data = penguins, aes(x = bill_length_mm, y = body_mass_g)) + 
  geom_point() + geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

na.rm argument & captions

ggplot(data = penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point(na.rm = TRUE) +
  labs(title = "Scatterplot with Missing Values Removed",
       x = "Bill Length (mm)",
       y = "Bill Depth (mm)",
       caption = "Data come from the palmerpenguins package")

Recreate Visualization

ggplot(data = penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = bill_depth_mm)) +  # Map bill_depth_mm to color at the geom level
  geom_smooth(method = "gam") +  # Add a single smooth line
   # Adjust gradient colors
  labs(
    title = "Scatterplot of Flipper Length vs. Body Mass",
    x = "Flipper Length (mm)",
    y = "Body Mass (g)",
    color = "Bill Depth (mm)"
  )
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = island)
) +
  geom_point() +
  geom_smooth(se = FALSE)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot() +
  geom_point(
    data = penguins,
    mapping = aes(x = flipper_length_mm, y = body_mass_g)
  ) +
  geom_smooth(
    data = penguins,
    mapping = aes(x = flipper_length_mm, y = body_mass_g)
  )
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

gglot2 calls

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) + 
  geom_point()
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

#pipe
penguins |> 
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) + 
  geom_point()
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Visualizing distributions

A categorical variable

ggplot(penguins, aes(x = species)) +
  geom_bar()

#reorder
ggplot(penguins, aes(x = fct_infreq(species))) +
  geom_bar()

A numerical variable

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 200)
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 20)
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 2000)
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).

ggplot(penguins, aes(x = body_mass_g)) +
  geom_density()
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_density()`).

Exercises

barplot

ggplot(penguins, aes(y = species)) +
  geom_bar()

# assign species to the y aesthetic

####aesthetic

ggplot(penguins, aes(x = species)) +
  geom_bar(color = "red")

ggplot(penguins, aes(x = species)) +
  geom_bar(fill = "red")

histogram bins

# Default bin count (30)
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram() +
  labs(title = "Default Bins (30)")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).

# Using 10 bins
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(bins = 10) +
  labs(title = "10 Bins")
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).

# Using 50 bins
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(bins = 50) +
  labs(title = "50 Bins")
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).

binwidths

# binwidth 0,1
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
  labs(
    title = "Histogram of Carat with Binwidth 0.1",
    x = "Carat",
    y = "Frequency"
  ) +
  theme_grey()

# Experiment with smaller binwidth (e.g., 0.01)
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.01, fill = "lightgreen") +
  labs(
    title = "Histogram of Carat with Binwidth 0.01",
    x = "Carat",
    y = "Frequency"
  ) +
  theme_light()

# Experiment with larger binwidth (e.g., 0.5)
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.5, fill = "salmon", color = "pink") +
  labs(
    title = "Histogram of Carat with Binwidth 0.5",
    x = "Carat",
    y = "Frequency"
  ) +
  theme_bw()

Visualizing relationships

A numerical and a categorical variable

ggplot(penguins, aes(x = species, y = body_mass_g)) +
  geom_boxplot()
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_boxplot()`).

ggplot(penguins, aes(x = body_mass_g, color = species)) +
  geom_density(linewidth = 0.75)
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_density()`).

ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +
  geom_density(alpha = 0.5)
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_density()`).

Two categorical variables

ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar()

ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill")

Two numerical variables

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Three or more variables

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = island))
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) +
  facet_wrap(~island)
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Exercises

head(mpg)
# A tibble: 6 × 11
  manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
  <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…
6 audi         a4      2.8  1999     6 manual(m5) f        18    26 p     compa…
# Scatterplot of hwy vs. displ
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  labs(title = "Scatterplot of hwy vs. displ", x = "Engine Displacement (L)", y = "Highway MPG")

ggplot(mpg, aes(x = displ, y = hwy, color = cty)) +
  geom_point() +
  labs(title = "Mapping Numerical Variable (cty) to Color", x = "Engine Displacement (L)", y = "Highway MPG", color = "City MPG")

ggplot(mpg, aes(x = displ, y = hwy, size = cty)) +
  geom_point() +
  labs(title = "Mapping Numerical Variable (cty) to Size", x = "Engine Displacement (L)", y = "Highway MPG", size = "City MPG")

ggplot(mpg, aes(x = displ, y = hwy, color = cty, size = cty)) +
  geom_point() +
  labs(title = "Mapping Numerical Variable (cty) to Color and Size", x = "Engine Displacement (L)", y = "Highway MPG", color = "City MPG", size = "City MPG")

ggplot(mpg, aes(x = displ, y = hwy, color = class, shape = class, size = class)) +
  geom_point() +
  labs(
    title = "Mapping Categorical Variable (class) to Multiple Aesthetics",
    x = "Engine Displacement (L)",
    y = "Highway MPG",
    color = "Class",
    shape = "Class",
    size = "Class"
  )
Warning: Using size for a discrete variable is not advised.
Warning: The shape palette can deal with a maximum of 6 discrete values because more
than 6 becomes difficult to discriminate
ℹ you have requested 7 values. Consider specifying shapes manually if you need
  that many have them.
Warning: Removed 62 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
  geom_point() +
  labs(
    title = "Scatterplot of Bill Depth vs. Bill Length by Species",
    x = "Bill Length (mm)",
    y = "Bill Depth (mm)",
    color = "Species"
  )
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
  geom_point() +
  facet_wrap(~species) +
  labs(
    title = "Bill Depth vs. Bill Length by Species (Faceted & Colored)",
    x = "Bill Length (mm)",
    y = "Bill Depth (mm)",
    color = "Species"
  )
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(
  data = penguins,
  mapping = aes(
    x = bill_length_mm, y = bill_depth_mm, 
    color = species, shape = species
  )
) +
  geom_point() +
  labs(color = "Species")
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

#combine two legends
ggplot(
  data = penguins,
  mapping = aes(
    x = bill_length_mm,
    y = bill_depth_mm,
    color = species,
    shape = species
  )
) +
  geom_point() +
  labs(color = "Species", shape = "Species")  # Use the same label for both aesthetics
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill")

ggplot(penguins, aes(x = species, fill = island)) +
  geom_bar(position = "fill")

Saving plots

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(color = "skyblue")
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggsave(filename = "penguin-plot.png")
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
# Assign plots to variables
plot1 <- ggplot(mpg, aes(x = class)) +
  geom_bar(fill = "salmon")

plot2 <- ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point(aes(color = displ))

# Save a specific plot
ggsave("mpgplot.png", plot = plot1)
Saving 7 x 5 in image
ggsave("mpg-plot.pdf", plot = plot2)  # Saves plot2 as a PDF
Saving 7 x 5 in image
plot1

plot2

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color = drv))

Data transformation

library(nycflights13)
flights
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
glimpse(flights)
Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

dplyr

flights |>
  filter(dest == "IAH") |> 
  group_by(year, month, day) |> 
  summarize(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  )
`summarise()` has grouped output by 'year', 'month'. You can override using the
`.groups` argument.
# A tibble: 365 × 4
# Groups:   year, month [12]
    year month   day arr_delay
   <int> <int> <int>     <dbl>
 1  2013     1     1     17.8 
 2  2013     1     2      7   
 3  2013     1     3     18.3 
 4  2013     1     4     -3.2 
 5  2013     1     5     20.2 
 6  2013     1     6      9.28
 7  2013     1     7     -7.74
 8  2013     1     8      7.79
 9  2013     1     9     18.1 
10  2013     1    10      6.68
# ℹ 355 more rows

rows

flights |> 
  filter(dep_delay > 120)
# A tibble: 9,723 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      848           1835       853     1001           1950
 2  2013     1     1      957            733       144     1056            853
 3  2013     1     1     1114            900       134     1447           1222
 4  2013     1     1     1540           1338       122     2020           1825
 5  2013     1     1     1815           1325       290     2120           1542
 6  2013     1     1     1842           1422       260     1958           1535
 7  2013     1     1     1856           1645       131     2212           2005
 8  2013     1     1     1934           1725       129     2126           1855
 9  2013     1     1     1938           1703       155     2109           1823
10  2013     1     1     1942           1705       157     2124           1830
# ℹ 9,713 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
flights |> 
  filter(month == 1 & day == 1)
# A tibble: 842 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 832 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
flights |> 
  filter(month %in% c(1, 2))
# A tibble: 51,955 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 51,945 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
flights |> 
  arrange(desc(dep_delay))
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     9      641            900      1301     1242           1530
 2  2013     6    15     1432           1935      1137     1607           2120
 3  2013     1    10     1121           1635      1126     1239           1810
 4  2013     9    20     1139           1845      1014     1457           2210
 5  2013     7    22      845           1600      1005     1044           1815
 6  2013     4    10     1100           1900       960     1342           2211
 7  2013     3    17     2321            810       911      135           1020
 8  2013     6    27      959           1900       899     1236           2226
 9  2013     7    22     2257            759       898      121           1026
10  2013    12     5      756           1700       896     1058           2020
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
flights |> 
  distinct()
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>