library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
# Print the airlines lookup table (carrier code and airline name), referenced
# through its package namespace.
nycflights13::airlines
## # A tibble: 16 × 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
# Local aliases for the five nycflights13 tables, referenced through the
# package namespace so their origin is explicit.
airlines_data <- nycflights13::airlines
airports_data <- nycflights13::airports
flights_data  <- nycflights13::flights
planes_data   <- nycflights13::planes
weather_data  <- nycflights13::weather

# Show the first six rows of the carrier lookup table.
head(airlines_data, n = 6L)
## # A tibble: 6 × 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
# Preview the flights table; tibble printing shows only the first ten rows and
# lists the columns that do not fit on screen.
flights_data
## # A tibble: 336,776 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Print the full airlines lookup table (carrier code and airline name).
airlines
## # A tibble: 16 × 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
# Pull the `name` column out of the airlines table as a character vector.
airlines$name
## [1] "Endeavor Air Inc." "American Airlines Inc."
## [3] "Alaska Airlines Inc." "JetBlue Airways"
## [5] "Delta Air Lines Inc." "ExpressJet Airlines Inc."
## [7] "Frontier Airlines Inc." "AirTran Airways Corporation"
## [9] "Hawaiian Airlines Inc." "Envoy Air"
## [11] "SkyWest Airlines Inc." "United Air Lines Inc."
## [13] "US Airways Inc." "Virgin America"
## [15] "Southwest Airlines Co." "Mesa Airlines Inc."
library(ggplot2)
library(dplyr)
# Local copies of the airline and airport lookup tables.
airports_data <- airports
airlines_data <- airlines

# Subset to flights operated by carrier "UA".
# NOTE(review): the object is called `all_IAD`, but the filter selects on
# carrier, not on an airport code.
all_IAD <- filter(flights, carrier == "UA")

# Departure delay against arrival delay, jittered to reduce overplotting.
ggplot(all_IAD, aes(x = dep_delay, y = arr_delay)) +
  geom_jitter(height = 30, width = 30)
## Warning: Removed 883 rows containing missing values (`geom_point()`).
# Rank destination airports by number of flights and keep the five busiest.
#
# count() replaces the group_by() + summarise(n()) pair and returns an
# ungrouped tibble. slice_max() replaces the superseded top_n(): it names the
# ranking column explicitly instead of silently picking the last column (the
# source of the old "Selecting by num_flights" message), returns rows sorted
# from busiest to least busy, and, like top_n(), keeps ties by default.
top_airports <- flights %>%
  count(dest, name = "num_flights") %>%
  slice_max(num_flights, n = 5)
# Flights operated by carrier "UA", to every destination.
# NOTE(review): the object keeps its existing name `all_IAD` so that any other
# code referring to it still works, but it holds UA flights, not IAD flights.
all_IAD <- flights %>%
  filter(carrier == "UA")

# Departure delay against arrival delay, one colour per destination airport.
# The title previously read "Departing from IAD". nycflights13::flights only
# contains departures from the New York City airports, and the data are
# filtered by carrier rather than by airport, so the title now describes what
# is actually plotted.
ggplot(data = all_IAD, mapping = aes(x = dep_delay, y = arr_delay, color = dest)) +
  geom_jitter(width = 30, height = 30) +
  scale_color_discrete(name = "Destination Airport") +
  labs(
    title = "UA Flights Departing from NYC Airports",
    x = "Departure Delay (minutes)",
    y = "Arrival Delay (minutes)"
  )
## Warning: Removed 883 rows containing missing values (`geom_point()`).
# Five destination airports with the most flights, busiest first.
#
# slice_max() is used instead of the superseded top_n(): the ranking column is
# stated explicitly rather than inferred from column order, so dplyr no longer
# prints "Selecting by num_flights". Ties at the cut-off are kept, as they were
# with top_n(). count() tallies rows per destination and returns an ungrouped
# tibble, matching the former group_by() + summarise(n()) result.
top_airports <- flights %>%
  count(dest, name = "num_flights") %>%
  slice_max(num_flights, n = 5)
# Keep only the flights whose destination appears in `top_airports`.
flights_filtered <- filter(flights, is.element(dest, top_airports$dest))

# Departure delay against arrival delay for those flights, coloured by
# destination and jittered to reduce overplotting.
ggplot(flights_filtered, aes(x = dep_delay, y = arr_delay, colour = dest)) +
  geom_jitter(height = 30, width = 30) +
  scale_colour_discrete(name = "Destination Airport") +
  labs(
    y = "Arrival Delay (minutes)",
    x = "Departure Delay (minutes)",
    title = "Flights to Top 5 Busiest Airports"
  )
## Warning: Removed 1844 rows containing missing values (`geom_point()`).
I created a scatterplot that shows the relationship between departure delay and arrival delay. I first looked at it with all the destination airports and then narrowed it down to the top 5 busiest destination airports. One aspect I would like to highlight is that there appears to be a positive correlation between departure delay and arrival delay. What influences these factors?