Data Transformations & Manipulation with dplyr

Learning Objectives

In this activity we will explore some key concepts of data cleaning and data wrangling with R using the dplyr package.

  • Subsetting or selecting rows / observations with filter()

  • Sorting or arranging rows of a data set with arrange()

  • Rearranging and dropping columns / variables with select()

  • Modifying existing columns and creating new columns with mutate()

  • Data operations and summary statistics by sub-groups with group_by() and summarize()

Let’s load some packages to begin with

library(tidyverse)
library(skimr)
library(scales)
library(flextable)
library(mice)
library(naniar)

Next we import the data set on Michigan flights

# This data set is in my computer storage unfortunately
miFlights <- read_csv("miFlights2019-2021.csv")

Let’s Use the skim() function to explore characteristics of the data set.

skim(miFlights)
Data summary
Name miFlights
Number of rows 463818
Number of columns 37
_______________________
Column type frequency:
character 9
numeric 27
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
carrier 0 1.00 2 2 0 16 0
tailnum 2189 1.00 3 6 0 5250 0
origin 0 1.00 3 3 0 4 0
dest 0 1.00 3 3 0 130 0
carrier_name 0 1.00 9 34 0 16 0
plane_type 11140 0.98 23 23 0 1 0
plane_manufacturer 11140 0.98 6 29 0 16 0
plane_model 11140 0.98 5 15 0 93 0
plane_engine 11140 0.98 9 9 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2019.92 0.85 2019.00 2019.00 2020.00 2021.00 2021.00 ▇▁▆▁▆
month 0 1.00 6.53 3.48 1.00 3.00 7.00 10.00 12.00 ▇▅▅▅▇
day 0 1.00 15.74 8.76 1.00 8.00 16.00 23.00 31.00 ▇▇▇▇▆
dep_time 9060 0.98 1372.26 490.52 1.00 950.00 1355.00 1754.00 2400.00 ▁▇▇▇▆
sched_dep_time 0 1.00 1368.40 481.75 49.00 948.00 1355.00 1750.00 2336.00 ▁▇▇▇▆
dep_delay 9063 0.98 7.11 44.97 -54.00 -5.00 -3.00 0.00 2672.00 ▇▁▁▁▁
arr_time 9324 0.98 1481.01 506.74 1.00 1053.00 1502.00 1905.00 2400.00 ▁▅▇▇▆
sched_arr_time 0 1.00 1496.79 495.16 1.00 1103.00 1510.00 1910.00 2359.00 ▁▃▇▇▇
arr_delay 10239 0.98 0.16 47.21 -85.00 -17.00 -9.00 2.00 2649.00 ▇▁▁▁▁
flight 0 1.00 413.37 269.57 1.00 189.00 387.00 600.00 1322.00 ▇▇▆▂▁
air_time 10239 0.98 94.59 63.12 15.00 50.00 74.00 130.00 581.00 ▇▂▁▁▁
distance 0 1.00 641.00 488.23 74.00 296.00 500.00 957.00 4475.00 ▇▂▁▁▁
hour 0 1.00 13.41 4.79 0.00 9.00 13.00 17.00 23.00 ▁▇▇▇▆
minute 0 1.00 27.49 17.94 0.00 11.00 27.00 44.00 59.00 ▇▆▇▆▆
temp 441760 0.05 42.16 15.40 -4.00 32.00 37.90 48.90 90.00 ▁▆▇▂▁
dewp 441762 0.05 31.91 13.42 -9.00 23.00 28.90 39.90 75.90 ▁▆▇▃▁
humid 441773 0.05 68.87 15.10 25.87 57.93 71.82 80.66 100.00 ▁▃▅▇▃
wind_dir 9205 0.98 181.02 109.46 0.00 80.00 200.00 270.00 360.00 ▇▃▆▇▆
wind_speed 4367 0.99 8.59 5.64 0.00 4.60 8.06 11.51 42.58 ▇▆▁▁▁
wind_gust 4367 0.99 9.88 6.50 0.00 5.30 9.27 13.24 49.00 ▇▆▁▁▁
precip 430846 0.07 0.01 0.02 0.00 0.00 0.00 0.01 0.44 ▇▁▁▁▁
pressure 447131 0.04 1018.83 7.60 990.40 1014.10 1019.00 1023.40 1038.50 ▁▂▇▇▂
visib 1934 1.00 8.18 2.56 0.06 7.00 10.00 10.00 10.00 ▁▁▁▂▇
plane_year 21647 0.95 2008.12 7.15 1987.00 2003.00 2007.00 2015.00 2021.00 ▁▂▇▃▅
plane_engines 11140 0.98 2.00 0.02 2.00 2.00 2.00 2.00 3.00 ▇▁▁▁▁
plane_seats 11140 0.98 127.86 66.68 20.00 80.00 95.00 182.00 451.00 ▇▃▂▁▁
plane_speed 11140 0.98 0.01 1.72 0.00 0.00 0.00 0.00 438.00 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
time_hour 0 1 2019-01-01 05:00:00 2021-12-31 22:00:00 2020-03-26 06:00:00 19059

Bar charts

Create a bar chart showing how many flights departed out of each airport (origin) using the count() and geom_col() functions. Also sort the bars by descending height using the fct_reorder() function.

miFlights |>
  count(origin) |>
  ggplot(aes(x = fct_reorder(origin, n, .desc = TRUE),
             y = n, fill = origin)) +
  scale_fill_viridis_d() + 
  scale_y_continuous(labels = label_comma(),
                     expand = expansion(mult = c(0, 0.1))) +
  geom_col(color='black') +
labs(title = "Michigan Flights, 2019-2021",
     y = "Number of Flights",
     x = "Origin Airport",
     caption = "Data source: anyflights R package") +
  ggthemes::theme_few() +
  theme(legend.position = "none")

Reproduce the same bar chart using geom_bar(). Hint: one way to sort the bars by height with geom_bar()`` is to use thefct_infreq()` function.

miFlights |>
  ggplot(aes(x = fct_infreq(origin,ordered = TRUE), fill = origin)) +
  geom_bar(color = 'black') +
  scale_fill_viridis_d() + 
  scale_y_continuous(labels = label_comma()) +
  # geom_col(color='black') +
labs(title = "Michigan Flights, 2019-2021",
     y = "Number of Flights",
     x = "Origin Airport",
     caption = "Data source: anyflights R package") +
  ggthemes::theme_few() +
  theme(legend.position = "none")

miFlights |>
  ggplot(aes(x = fct_infreq(origin), fill = origin)) +
  scale_fill_viridis_d() + 
  scale_y_continuous(labels = label_comma(),
                     expand = expansion(mult = c(0, 0.1))) +
  geom_bar(color='black') +
labs(title = "Michigan Flights, 2019-2021",
     y = "Number of Flights",
     x = "Origin Airport",
     caption = "Data source: anyflights R package") +
  ggthemes::theme_few() +
  theme(legend.position = "none")

Variable types

# Let's examine the variable types in our data set
glimpse(miFlights)
## Rows: 463,818
## Columns: 37
## $ year               <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 201…
## $ month              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ day                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ dep_time           <dbl> 55, 455, 506, 531, 534, 550, 555, 555, 555, 600, 60…
## $ sched_dep_time     <dbl> 2115, 500, 511, 535, 545, 600, 600, 555, 600, 600, …
## $ dep_delay          <dbl> 220, -5, -5, -4, -11, -10, -5, 0, -5, 0, -2, 3, 4, …
## $ arr_time           <dbl> 426, 830, 710, 647, 750, 712, 822, 709, 755, 559, 9…
## $ sched_arr_time     <dbl> 2323, 834, 730, 710, 742, 748, 834, 715, 817, 615, …
## $ arr_delay          <dbl> 303, -4, -20, -23, 8, -36, -12, -6, -22, -16, 3, -7…
## $ carrier            <chr> "OH", "YX", "AA", "WN", "B6", "YX", "OO", "WN", "DL…
## $ flight             <dbl> 1019, 954, 185, 203, 310, 790, 803, 295, 348, 218, …
## $ tailnum            <chr> "N567NN", "N433YX", "N853NN", "N227WN", "N203JB", "…
## $ origin             <chr> "DTW", "GRR", "DTW", "DTW", "DTW", "DTW", "FNT", "D…
## $ dest               <chr> "CLT", "MIA", "DFW", "BWI", "BOS", "EWR", "ATL", "D…
## $ air_time           <dbl> 88, 187, 162, 64, 71, 61, 125, 174, 105, 45, 179, 1…
## $ distance           <dbl> 500, 1214, 986, 409, 632, 488, 645, 1123, 640, 228,…
## $ hour               <dbl> 21, 5, 5, 5, 5, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
## $ minute             <dbl> 15, 0, 11, 35, 45, 0, 0, 55, 0, 0, 3, 0, 0, 5, 0, 1…
## $ time_hour          <dttm> 2019-01-01 21:00:00, 2019-01-01 05:00:00, 2019-01-…
## $ temp               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ dewp               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ humid              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ wind_dir           <dbl> 10, 320, 290, 290, 290, 290, 330, 290, 290, 290, 29…
## $ wind_speed         <dbl> 8.05546, 9.20624, 19.56326, 19.56326, 19.56326, 13.…
## $ wind_gust          <dbl> 9.270062, 10.594357, 22.513008, 22.513008, 22.51300…
## $ precip             <dbl> NA, 1e-04, NA, NA, NA, NA, 1e-04, NA, NA, NA, NA, N…
## $ pressure           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ visib              <dbl> 1.25, 9.00, 7.00, 7.00, 7.00, 4.00, 9.00, 7.00, 10.…
## $ carrier_name       <chr> "PSA Airlines Inc.", "Republic Airline", "American …
## $ plane_year         <dbl> 2015, 2014, 2010, 2005, 2006, 2016, 2006, 1999, 200…
## $ plane_type         <chr> "Fixed wing multi engine", "Fixed wing multi engine…
## $ plane_manufacturer <chr> "BOMBARDIER INC", "EMBRAER S A", "BOEING", "BOEING"…
## $ plane_model        <chr> "CL-600-2D24", "ERJ 170-200 LR", "737-823", "737-7H…
## $ plane_engines      <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ plane_seats        <dbl> 95, 88, 162, 140, 20, 88, 95, 149, 100, 140, 162, 1…
## $ plane_speed        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ plane_engine       <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan",…

The filter function for subsetting rows

Selecting all flights on January 1st in the data set, create a new object called janFlights.

# Subsetting to flights that departed in January 1st
janFlights <- miFlights |>
  dplyr::filter(month == 1, day == 1)

Suppose we want to create a data set called dec25 that contains flight data from December 25th. What code would we need using the filter() function to create dec25?

# Christmas day flights
dec25 <- miFlights |>
  dplyr::filter(month == 12 & day == 25)

Find all flights that departed in November or December, creating an object called novDec.

# Flights in December or November
novDec <- miFlights |>
  dplyr::filter(month == 12 | month == 11)

# or

notNovDec <- miFlights |>
  dplyr::filter(month %in% 11:12 )

Find all flights that departed in summer break.

# Flights in summer break
summer_break_flights <- miFlights |>
  dplyr::filter(month %in% 5:8)

Select all flights except those in the months of November or December using !.

# Flights not in Nov and not in Dec
notNovDec <- miFlights |>
  dplyr::filter(month !=12 & month != 11)

Knowing that arr_delay and dep_delay represent the arrival and departure delays in minutes respectively, what data set is produced using the code below?

dplyr::filter(miFlights, !(arr_delay > 120 | dep_delay > 120))
## # A tibble: 444,025 × 37
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <dbl> <dbl> <dbl>    <dbl>          <dbl>     <dbl>    <dbl>          <dbl>
##  1  2019     1     1      455            500        -5      830            834
##  2  2019     1     1      506            511        -5      710            730
##  3  2019     1     1      531            535        -4      647            710
##  4  2019     1     1      534            545       -11      750            742
##  5  2019     1     1      550            600       -10      712            748
##  6  2019     1     1      555            600        -5      822            834
##  7  2019     1     1      555            555         0      709            715
##  8  2019     1     1      555            600        -5      755            817
##  9  2019     1     1      600            600         0      559            615
## 10  2019     1     1      601            603        -2      927            924
## # ℹ 444,015 more rows
## # ℹ 29 more variables: arr_delay <dbl>, carrier <chr>, flight <dbl>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, temp <dbl>, dewp <dbl>,
## #   humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, carrier_name <chr>,
## #   plane_year <dbl>, plane_type <chr>, plane_manufacturer <chr>, …
dplyr::filter(miFlights, arr_delay <= 120, dep_delay <= 120)
## # A tibble: 444,025 × 37
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <dbl> <dbl> <dbl>    <dbl>          <dbl>     <dbl>    <dbl>          <dbl>
##  1  2019     1     1      455            500        -5      830            834
##  2  2019     1     1      506            511        -5      710            730
##  3  2019     1     1      531            535        -4      647            710
##  4  2019     1     1      534            545       -11      750            742
##  5  2019     1     1      550            600       -10      712            748
##  6  2019     1     1      555            600        -5      822            834
##  7  2019     1     1      555            555         0      709            715
##  8  2019     1     1      555            600        -5      755            817
##  9  2019     1     1      600            600         0      559            615
## 10  2019     1     1      601            603        -2      927            924
## # ℹ 444,015 more rows
## # ℹ 29 more variables: arr_delay <dbl>, carrier <chr>, flight <dbl>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, temp <dbl>, dewp <dbl>,
## #   humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, carrier_name <chr>,
## #   plane_year <dbl>, plane_type <chr>, plane_manufacturer <chr>, …

Create a new object called miFlightsComplete where all departure times are non-missing, and miFlightsMiss where all departure times are missing

# Rows with non-missing departure time
miFlightsComplete <- miFlights |>
  dplyr::filter(is.na(dep_time) == FALSE)

# Rows with missing departure time
miFlightsMiss <- miFlights |>
  dplyr::filter(is.na(dep_time) == TRUE)

Arrange rows with arrange(). The arrange() function for sorting rows

Sort miFlights by the day of the flight (smallest to largest), and print the first 4 columns and 5 rows of the resulting data set using the slice_head() function.

library(gt)
miFlights |>
  arrange(day) |>
  dplyr::select(1:4) |>
  slice_head(n = 5) |>
  gt()
year month day dep_time
2019 1 1 55
2019 1 1 455
2019 1 1 506
2019 1 1 531
2019 1 1 534

Sort miFlights by the day of the flight (largest to smallest), and print the first 4 columns and 5 rows of the resulting data set using the slice_head() function.

library(gt)
miFlights |>
  arrange(desc(day)) |>
  dplyr::select(1:4) |>
  slice_head(n = 5) |>
  gt()
year month day dep_time
2019 1 31 59
2019 1 31 535
2019 1 31 540
2019 1 31 548
2019 1 31 549

Sort miFlights by the year, month, and day of the flight.

library(gt)
miFlights |>
  arrange(year, month, day) |>
  dplyr::select(1:4) |>
  slice_head(n = 5) |>
  gt()
year month day dep_time
2019 1 1 55
2019 1 1 455
2019 1 1 506
2019 1 1 531
2019 1 1 534
# for desc

miFlights |>
  arrange(year, desc(month), day) |>
  dplyr::select(1:4) |>
  slice_head(n = 5) |>
  gt()
year month day dep_time
2019 12 1 12
2019 12 1 46
2019 12 1 48
2019 12 1 58
2019 12 1 108

Sort miFlights to find the 3 most delayed flights (arr_delay), and the 3 that left the earliest relative to their scheduled departure (dep_delay).

# 3 most delayed flights

miFlights |>
  arrange(desc(arr_delay)) |>
  # dplyr::select(1:8) |>
  slice_head(n = 3) |>
  gt()
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute time_hour temp dewp humid wind_dir wind_speed wind_gust precip pressure visib carrier_name plane_year plane_type plane_manufacturer plane_model plane_engines plane_seats plane_speed plane_engine
2019 2 17 1400 1728 2672 1531 1922 2649 MQ 540 N817AE DTW LGA 70 502 17 28 2019-02-17 17:00:00 NA NA NA 80 10.35702 11.918651 NA NA 10 Envoy Air 2002 Fixed wing multi engine EMBRAER EMB-135KL 2 37 0 Turbo-jet
2021 7 12 1643 815 1948 1754 913 1961 G4 216 219NV GRR LAS 220 1642 8 15 2021-07-12 08:00:00 NA NA NA 50 8.05546 9.270062 NA NA 10 Allegiant Air NA NA NA NA NA NA NA NA
2019 12 19 1402 722 1840 1556 1004 1792 AA 93 N807AW GRR PHX 216 1574 7 22 2019-12-19 07:00:00 NA NA NA 150 5.75390 6.621473 NA NA 10 American Airlines Inc. 1999 Fixed wing multi engine AIRBUS INDUSTRIE A319-132 2 179 0 Turbo-jet
# 3 that left the earliest

miFlights |>
  arrange(dep_delay) |>
  # dplyr::select(1:8) |>
  slice_head(n = 3) |>
  gt()
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute time_hour temp dewp humid wind_dir wind_speed wind_gust precip pressure visib carrier_name plane_year plane_type plane_manufacturer plane_model plane_engines plane_seats plane_speed plane_engine
2020 3 29 2019 2113 -54 2301 2344 -43 G4 102 229NV FNT PIE 132 1040 21 13 2020-03-29 21:00:00 NA NA NA 240 16.11092 18.540125 NA NA 10 Allegiant Air NA NA NA NA NA NA NA NA
2020 3 31 1521 1610 -49 1750 1842 -52 G4 102 308NV FNT PIE 133 1040 16 10 2020-03-31 16:00:00 NA NA NA 30 5.75390 6.621473 NA NA 10 Allegiant Air NA NA NA NA NA NA NA NA
2019 1 11 2059 2145 -46 2250 2348 -58 NK 20 N614NK DTW DFW 146 986 21 45 2019-01-11 21:00:00 NA NA NA 150 5.75390 6.621473 NA NA 10 Spirit Air Lines 2012 Fixed wing multi engine AIRBUS A320-232 2 200 0 Turbo-fan

Sort miFlights to find the fastest (highest speed) flights using a function of the variables distance and air_time.

# Fastest flight

miFlights |>
  arrange(desc(distance / air_time)) |>
  slice_head(n = 2) |>
  gt()
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute time_hour temp dewp humid wind_dir wind_speed wind_gust precip pressure visib carrier_name plane_year plane_type plane_manufacturer plane_model plane_engines plane_seats plane_speed plane_engine
2019 7 14 1230 1215 15 1339 1326 13 OO 572 N837SK DTW TVC 17 207 12 15 2019-07-14 12:00:00 NA NA NA 340 5.7539 6.621473 NA NA 10 SkyWest Airlines Inc. 2019 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 0 Turbo-fan
2020 8 21 1645 1605 40 1621 1626 -5 OO 300 N885AS DTW GRB 25 287 16 5 2020-08-21 16:00:00 NA NA NA 0 0.0000 0.000000 NA NA 10 SkyWest Airlines Inc. 2001 Fixed wing multi engine BOMBARDIER INC CL-600-2B19 2 55 0 Turbo-fan

For flights coming out of GRR, find the 3 flights that traveled the farthest (distance) and that arrived the earliest in the morning (arr_time) simultaneously.

miFlights |>
  filter(origin == "GRR") |>
  arrange(desc(distance), arr_time) |>
  slice_head(n = 3)
## # A tibble: 3 × 37
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <dbl> <dbl> <dbl>    <dbl>          <dbl>     <dbl>    <dbl>          <dbl>
## 1  2021     3    28      555            600        -5      714            758
## 2  2021     4    11      552            600        -8      716            740
## 3  2021     3    18      558            600        -2      720            758
## # ℹ 29 more variables: arr_delay <dbl>, carrier <chr>, flight <dbl>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, temp <dbl>, dewp <dbl>,
## #   humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, carrier_name <chr>,
## #   plane_year <dbl>, plane_type <chr>, plane_manufacturer <chr>,
## #   plane_model <chr>, plane_engines <dbl>, plane_seats <dbl>, …

Selecting columns with select() function

Drop the year and month columns from miFlights creating a new data set called miDropped.

# Dropping year and month
miDropped <- miFlights |>
  dplyr::select(-year, -month)

We can also drop sets of contiguous or touching columns:

Drop all variables between year and day columns (inclusive) from miFlights creating a new data set called miDropped2.

miDropped2 <- miFlights |>
  dplyr::select(-c(year:day))

Another option is to use select() in tandem with the everything() helper.

miFlights |>
  dplyr::select(time_hour, air_time, everything()) |>
  slice_head(n = 5)
## # A tibble: 5 × 37
##   time_hour           air_time  year month   day dep_time sched_dep_time
##   <dttm>                 <dbl> <dbl> <dbl> <dbl>    <dbl>          <dbl>
## 1 2019-01-01 21:00:00       88  2019     1     1       55           2115
## 2 2019-01-01 05:00:00      187  2019     1     1      455            500
## 3 2019-01-01 05:00:00      162  2019     1     1      506            511
## 4 2019-01-01 05:00:00       64  2019     1     1      531            535
## 5 2019-01-01 05:00:00       71  2019     1     1      534            545
## # ℹ 30 more variables: dep_delay <dbl>, arr_time <dbl>, sched_arr_time <dbl>,
## #   arr_delay <dbl>, carrier <chr>, flight <dbl>, tailnum <chr>, origin <chr>,
## #   dest <chr>, distance <dbl>, hour <dbl>, minute <dbl>, temp <dbl>,
## #   dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, carrier_name <chr>,
## #   plane_year <dbl>, plane_type <chr>, plane_manufacturer <chr>,
## #   plane_model <chr>, plane_engines <dbl>, plane_seats <dbl>, …

We can use select() and everything() to rearrange columns and still drop columns too:

miFlights |>
  dplyr::select(time_hour, air_time, everything(), -day) |>
  slice_head(n=5)
## # A tibble: 5 × 36
##   time_hour           air_time  year month dep_time sched_dep_time dep_delay
##   <dttm>                 <dbl> <dbl> <dbl>    <dbl>          <dbl>     <dbl>
## 1 2019-01-01 21:00:00       88  2019     1       55           2115       220
## 2 2019-01-01 05:00:00      187  2019     1      455            500        -5
## 3 2019-01-01 05:00:00      162  2019     1      506            511        -5
## 4 2019-01-01 05:00:00       64  2019     1      531            535        -4
## 5 2019-01-01 05:00:00       71  2019     1      534            545       -11
## # ℹ 29 more variables: arr_time <dbl>, sched_arr_time <dbl>, arr_delay <dbl>,
## #   carrier <chr>, flight <dbl>, tailnum <chr>, origin <chr>, dest <chr>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, temp <dbl>, dewp <dbl>,
## #   humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, carrier_name <chr>,
## #   plane_year <dbl>, plane_type <chr>, plane_manufacturer <chr>,
## #   plane_model <chr>, plane_engines <dbl>, plane_seats <dbl>, …

Create a subset of the miFlights data set called timeFlights that only contains variables that end with the word “time”.

timeFlights <- miFlights |>
  dplyr::select(ends_with("time")) |>
  slice_head(n = 5)

Create a new data frame called departureInfo that only has variables that start with “dep”

departureInfo <- miFlights |>
  dplyr::select(starts_with("dep")) |>
  slice_head(n = 5)

Create a new data frame called newFlights by rearranging the columns of the full miFlights data set so that flight number (flight), origin (origin), and destination (dest) are provided first, then all other columns except the tail number (tailnum).

newFlights <- miFlights |>
  dplyr::select(flight, origin, dest, everything(), -tailnum) |>
  slice_head( n=5)

Adding new variables with mutate() function

We can create a new variable, gain, that is a function of existing variables in miFlights:

flights_sml <- miFlights |> dplyr::select(ends_with("delay"), distance, air_time)

# Creating time gained variable
flights_sml |> mutate(gain = dep_delay - arr_delay) |> 
  slice_head(n = 5)
## # A tibble: 5 × 5
##   dep_delay arr_delay distance air_time  gain
##       <dbl>     <dbl>    <dbl>    <dbl> <dbl>
## 1       220       303      500       88   -83
## 2        -5        -4     1214      187    -1
## 3        -5       -20      986      162    15
## 4        -4       -23      409       64    19
## 5       -11         8      632       71   -19

Extending the code provided with a single call to mutate(), create a new variable, speed, that is equal to distance divided by air_time, producing a new data set called flightSpeeds.

# Culculating speed variable
flightSpeeds <- flights_sml |>
  mutate(gain = dep_delay - arr_delay,
         speed_mph = distance / air_time*60)

Visualizing relationship between flights speeds and gains

Create a plot showing the relationship between the speed and time gain of each flight, adding appropriate axis and title labels.

Add color to the plot to display the distance the flight traveled as well. Is there a noticeable pattern?

flightSpeeds |>
  slice_sample(n = 10000) |>
  ggplot(aes(x = speed_mph, 
             y = gain,
             color = distance)) +
  geom_point(alpha = 0.1) +
  labs(title = "michigan Flights, 2019-2021",
       x = "Speed(miles per hour)",
       y = "Gain(minutes)",
       caption = "Data source: anyflights R package",
       color = "Distance(in miles)") +
  ggthemes::theme_few()

Group-wise operations and statistics with group_by() & summarize()

Together group_by() and summarise() provide useful tools: grouped data operations and summaries.

miFlights |> group_by(year, month, day) |> 
summarize(delay = mean(dep_delay, na.rm = TRUE)) |> 
  ungroup() |>
  slice_head(n = 5)
## # A tibble: 5 × 4
##    year month   day  delay
##   <dbl> <dbl> <dbl>  <dbl>
## 1  2019     1     1  8.58 
## 2  2019     1     2 12.5  
## 3  2019     1     3  0.721
## 4  2019     1     4 -0.477
## 5  2019     1     5  2.17

Waterfall plot

Let’s see which airlines tend to have the worst delays. The code below creates a summary table containing the average flight delay in minutes for each carrier.

Reproduce the waterfall plot below using this summary table and the colors c(“#D55E00”, “#0072B2”).

# Calculating average flight delay by carrier
delaySummary <- miFlights |> 
  group_by(carrier_name) |>
  summarize(Delay = mean(arr_delay, na.rm = TRUE))


# Creating waterfall chart
delaySummary |>
  ggplot(aes(x = carrier_name,
             y = Delay)) +
  geom_col() +
  labs(title =  "Average Flight Delays per Carrrier",
       subtitle = "Michigan Flights, 2019-2021",
       x = "Carrier",
       y = "Delay(Minutes)",
       caption ="Data source: anyflights R package") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Re-order the bars height based on Delay, add colorand add theme

# Calculating average flight delay by carrier
delaySummary <- miFlights |> group_by(carrier_name) |> 
  group_by(carrier_name) |>
  summarize(Delay = mean(arr_delay, na.rm = TRUE))



# Creating waterfall chart
delaySummary |>
  ggplot(aes(x = fct_reorder(carrier_name, Delay, .desc = FALSE),
             y = Delay,
             fill = Delay > 0)) +
  geom_col() +
    scale_fill_manual(values = c("#D55E00", "#0072B2")) +
  labs(title =  "Average Flight Delays per Carrrier",
       subtitle = "Michigan Flights, 2019-2021",
       x = "Carrier",
       y = "Delay(Minutes)",
       caption ="Data source: anyflights R package") +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none",
        title = element_text(face = "bold"))

Create a data frame summarizing the median flight delay (arr_delay) by month. Which month has the worst delays? In which month are flights most early / on-time?

# Median flight delay by month and get one with worst delays
worst_delay_month <- miFlights |>
  group_by(month) |>
  summarize(median_arr_delay = median(arr_delay, na.rm = TRUE)) |>
  arrange(desc(median_arr_delay)) |>
  slice(1)

June has worst delays

# Month with most on-time flights
on_time_month <- miFlights |>
  group_by(month) |>
  summarize(median_arr_delay = median(arr_delay, na.rm = TRUE)) |>
  arrange(median_arr_delay) |>
  slice(1)

January has the most on-time flights

Missing Values

Use the gg_miss_var() function from the naniar package to visualize missingness for each variable individually using a lollipop chart.

miFlights |>
  gg_miss_var(show_pct = TRUE)

Reproduce the visualization below using the facet argument with gg_miss_var().

miFlights |>
  gg_miss_var(show_pct = TRUE,
              facet = origin)

Another way to visualize this missingness is using the gg_miss_fct() function from the naniar package. Recreate the mosaic plot showing missingness for each variable depending on the origin airport using the gg_miss_fct() function.

miFlights |>
  dplyr::select(origin, dest, carrier,
                contains(c("time", "delay"))) |>
  gg_miss_fct(fct = origin)

Use the gg_miss_fct() function from the naniar package to visualize missingness for each variable depending on the carrier.

miFlights |>
  gg_miss_fct(fct = carrier_name)