dplyr and tidyr are part of tidyverse
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Get the untidy airlines data
untidy_data <- read.csv("flights.csv")
head(untidy_data)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
#Rename the columns
names(untidy_data) <- c("Airline", "Departure", "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle")
head(untidy_data)
## Airline Departure Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Tidy it up
# Apologies for doing this. RStudio would work correctly but suddenly decides it doesn't know how to count. It gave the result TRUE TRUE TRUE TRUE TRUE for nchar(untidy_data$Airline) > 0
# nchar(untidy$Airline) also gave the incorrect result of 6 6 6 6 6
untidy_data$Airline[2] = untidy_data$Airline[1]
untidy_data$Airline[5] = untidy_data$Airline[4]
airlines <- untidy_data |>
pivot_longer(`Los Angeles`:`Seattle`, names_to = "Destination", values_to = "Times", values_drop_na = TRUE)
str(airlines)
## tibble [20 × 4] (S3: tbl_df/tbl/data.frame)
## $ Airline : chr [1:20] "ALASKA" "ALASKA" "ALASKA" "ALASKA" ...
## $ Departure : chr [1:20] "on time" "on time" "on time" "on time" ...
## $ Destination: chr [1:20] "Los Angeles" "Phoenix" "San Diego" "San Francisco" ...
## $ Times : int [1:20] 497 221 212 503 1841 62 12 20 102 305 ...
head(airlines)
## # A tibble: 6 × 4
## Airline Departure Destination Times
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
Comparing Arrival Delays
#Delayed Flights
## AM West have more delayed flights on average along with having a higher median of delayed flights
airlines |>
group_by(Airline) |>
filter(Departure == "delayed") |>
summarize(average = mean(Times), median = median(Times), standard_deviation = sd(Times))
## # A tibble: 2 × 4
## Airline average median standard_deviation
## <chr> <dbl> <int> <dbl>
## 1 ALASKA 100. 62 120.
## 2 AM WEST 157. 117 147.
delayed_flights <- airlines |>
filter(Departure == "delayed")
ggplot(delayed_flights, aes(x = Airline, y = Times)) + geom_point(aes(color = Destination))
# On Time Flights
## AM West also have a higher average of being on time for flights than Alaska Airlines, it have a lower median though
airlines |>
group_by(Airline) |>
filter(Departure == "on time") |>
summarize(average = mean(Times), median = median(Times), standard_deviation = sd(Times))
## # A tibble: 2 × 4
## Airline average median standard_deviation
## <chr> <dbl> <int> <dbl>
## 1 ALASKA 655. 497 678.
## 2 AM WEST 1288. 383 1994.
ontime_flights <- airlines |>
filter(Departure == "on time")
ggplot(ontime_flights, aes(x = Airline, y = Times)) + geom_point(aes(color = Destination))
# Delayed Flights be Destination
airlines |>
group_by(Destination) |>
filter(Departure == "delayed") |>
summarize(average = mean(Times), median = median(Times))
## # A tibble: 5 × 3
## Destination average median
## <chr> <dbl> <dbl>
## 1 Los Angeles 89.5 89.5
## 2 Phoenix 214. 214.
## 3 San Diego 42.5 42.5
## 4 San Francisco 116. 116.
## 5 Seattle 183 183
# On Time Flights be Destination
airlines |>
group_by(Destination) |>
filter(Departure == "on time") |>
summarize(average = mean(Times), median = median(Times))
## # A tibble: 5 × 3
## Destination average median
## <chr> <dbl> <dbl>
## 1 Los Angeles 596. 596.
## 2 Phoenix 2530. 2530.
## 3 San Diego 298. 298.
## 4 San Francisco 412. 412.
## 5 Seattle 1021 1021