library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(fueleconomy)
## Warning: package 'fueleconomy' was built under R version 4.0.4
#find the average the delay of arrivals at airports
flights_delay <- flights %>%
group_by(dest) %>%
#summarize average arrival delays
summarise(Avg_delay = mean(arr_delay, na.rm = TRUE)) %>%
#join airports table to get the location of the airports
inner_join(airports, by = c("dest" = "faa"))
#Graph location of destination airports on a map and flight delays by using a color gradient
flights_delay %>%
ggplot(aes(lon, lat, colour = Avg_delay)) +
borders("state") +
geom_point() +
coord_quickmap()+
ylab("latitude")+
xlab("longitude")+
ggtitle("Average Delay by Arrival Times")
#subset the faa, lon, and lat variable from the airports table
location <- airports %>%
select(faa, lon, lat)
flights %>%
#join the location table to get the origin and destination longitude and latitude locations
left_join(location, by = c("origin" = "faa")) %>%
left_join(location, by = c("dest" = "faa"), suffix = c("_origin", "_dest"))
## # A tibble: 336,776 x 23
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 15 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # lon_origin <dbl>, lat_origin <dbl>, lon_dest <dbl>, lat_dest <dbl>
#create table to with age of the planes
age <- planes %>%
#create a new variable called age by calculating the difference of the year 2013 by the values in the year column
mutate(age = 2013 - year)
flights_delay_age <- flights %>%
#join age table
left_join(age, by = "tailnum") %>%
#filter out planes with an age of 0
filter(age != 0)%>%
group_by(age) %>%
#summarize average departure delay by the age of the planes
summarise(Avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
ungroup()
#graph the table of the age of the plane by the average delay
flights_delay_age %>%
ggplot(aes( x =age, y = Avg_delay )) +
geom_point() +
ylab("Average Departure Delay")+
xlab("Age of Plane")+
ggtitle("Affect of Age on Departure Delays")
There is no correlation between the age of the plane the average delays it experiences.
#
flight_weather <- flights %>%
#join weather table
left_join(weather, by = "time_hour", suffix = c("_flight", "_weather")) %>%
group_by(visib) %>%
#summarize average depature delay by level of visibilty
summarise(Avg_dep_delay = mean(dep_delay, na.rm = TRUE))
#Graph
flight_weather %>%
ggplot(aes(x = visib, y = Avg_dep_delay)) +
geom_line()+
ylab("Average Departure Delay")+
xlab("Visibility")+
ggtitle("Correlation Between Visibility and Average Departure Delays")
## Warning: Removed 1 row(s) containing missing values (geom_path).
Higher delays were experienced during low Visibility.
#create table to visiulize weather affects on delays in 6/13/2013
june <- flights %>%
#filter for the date
filter( month == 6 & day == 13 & year == 2013) %>%
group_by(dest) %>%
#summarize average delay by destination
summarise(Avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
#join airports table
inner_join(airports, by = c("dest" = "faa"))
#Graph map of delays
june %>%
ggplot(aes(y = lat, x = lon, color = Avg_arr_delay)) +
borders("state") +
geom_point() +
coord_quickmap()+
ylab("latitude")+
xlab("longitude")+
ggtitle("Average Delay by Arrival Times on 6/13/2013")
The midwestern and southeastern United States were affected by two derecho thunderstorms.
flights %>%
#anti_join planes table on tail number to remove planes with tail numbers from the flights table
anti_join(planes, by = "tailnum") %>%
#count carrier that do have planes without a tail number
count(carrier, sort = TRUE)
## # A tibble: 10 x 2
## carrier n
## <chr> <int>
## 1 MQ 25397
## 2 AA 22558
## 3 UA 1693
## 4 9E 1044
## 5 B6 830
## 6 US 699
## 7 FL 187
## 8 DL 110
## 9 F9 50
## 10 WN 38
flights %>%
#filter for flights that do not have a tail number
filter(is.na(tailnum))
## # A tibble: 2,512 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 2 NA 1545 NA NA 1910
## 2 2013 1 2 NA 1601 NA NA 1735
## 3 2013 1 3 NA 857 NA NA 1209
## 4 2013 1 3 NA 645 NA NA 952
## 5 2013 1 4 NA 845 NA NA 1015
## 6 2013 1 4 NA 1830 NA NA 2044
## 7 2013 1 5 NA 840 NA NA 1001
## 8 2013 1 7 NA 820 NA NA 958
## 9 2013 1 8 NA 1645 NA NA 1838
## 10 2013 1 9 NA 755 NA NA 1012
## # ... with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
The planes with missing tail numbers have missing departure and arrival times. It can be assumed that the planes without tail numbers were grounded and didn’t leave the origin airport. American Airways (AA) and Envoy Air (MQ) have a higher number of planes without a tail number because they report fleet numbers rather than tail numbers.
#find planes that conducted at least 100 flights
FLT_100<- flights %>%
group_by(tailnum) %>%
#count tail number to count the number of flights a plane has conducted
count(tailnum) %>%
#filter for flights completed 100 or more flights
filter(n >= 100)
flights %>%
#semi join FLT_100 table to remove planes that have had less than 100 flights from the flights table
semi_join(FLT_100, by = "tailnum")
## # A tibble: 230,902 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 544 545 -1 1004 1022
## 4 2013 1 1 554 558 -4 740 728
## 5 2013 1 1 555 600 -5 913 854
## 6 2013 1 1 557 600 -3 709 723
## 7 2013 1 1 557 600 -3 838 846
## 8 2013 1 1 558 600 -2 849 851
## 9 2013 1 1 558 600 -2 853 856
## 10 2013 1 1 558 600 -2 923 937
## # ... with 230,892 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
vehicles %>%
#semi join the common table keep only the common vehicles in the vehicles table
semi_join(common)
## Joining, by = c("make", "model")
## # A tibble: 14,531 x 12
## id make model year class trans drive cyl displ fuel hwy cty
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1833 Acura Integ~ 1986 Subcom~ Autom~ Front-~ 4 1.6 Regu~ 28 22
## 2 1834 Acura Integ~ 1986 Subcom~ Manua~ Front-~ 4 1.6 Regu~ 28 23
## 3 3037 Acura Integ~ 1987 Subcom~ Autom~ Front-~ 4 1.6 Regu~ 28 22
## 4 3038 Acura Integ~ 1987 Subcom~ Manua~ Front-~ 4 1.6 Regu~ 28 23
## 5 4183 Acura Integ~ 1988 Subcom~ Autom~ Front-~ 4 1.6 Regu~ 27 22
## 6 4184 Acura Integ~ 1988 Subcom~ Manua~ Front-~ 4 1.6 Regu~ 28 23
## 7 5303 Acura Integ~ 1989 Subcom~ Autom~ Front-~ 4 1.6 Regu~ 27 22
## 8 5304 Acura Integ~ 1989 Subcom~ Manua~ Front-~ 4 1.6 Regu~ 28 23
## 9 6442 Acura Integ~ 1990 Subcom~ Autom~ Front-~ 4 1.8 Regu~ 24 20
## 10 6443 Acura Integ~ 1990 Subcom~ Manua~ Front-~ 4 1.8 Regu~ 26 21
## # ... with 14,521 more rows
two_day <-
flights %>%
group_by(month, day) %>%
#Summarize total delay by month and day
summarize(tot_delay = sum(arr_delay + dep_delay, na.rm = TRUE)) %>%
#create a variable for a two day delay by using the total delay from the summarize function and the lag function to add the delay for two consecutive days
mutate(twoday = tot_delay + lag(tot_delay)) %>%
arrange(desc(twoday))
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
weather2 <-
weather %>%
group_by(month, day) %>%
#summarize the mean of the weather conditions by month and day
summarize(across(c(humid, precip, temp, visib), .fns = mean, na.rm = TRUE))
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
two_day_weather <-
two_day %>%
#join the weather2 table with the two_day table
left_join(weather2) %>%
#arrange by the two day delay time in descending order
arrange(desc(twoday))
## Joining, by = c("month", "day")
#create graph to show the affect of weather conditions on delay times of 2 consecutive days
two_day_weather %>%
ggplot(aes(x= twoday, y = humid, color = visib, size = precip))+
geom_point() +
ylab("Relative Humidity") +
xlab("Total 48 Hour Delay Between Consecutive Days") +
ggtitle("Weather Affects on Flight Delays Within 48 Hours")
## Warning: Removed 13 rows containing missing values (geom_point).
The worst forty-eight-hour delay was recorded on July 23, 2013. Forty-eight-hour delays seem to have a positive correlation with relative humidity.
flights %>%
#anti join airports table
anti_join(airports, by = c("dest" = "faa"))
## # A tibble: 7,602 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 615 615 0 1039 1100
## 3 2013 1 1 628 630 -2 1137 1140
## 4 2013 1 1 701 700 1 1123 1154
## 5 2013 1 1 711 715 -4 1151 1206
## 6 2013 1 1 820 820 0 1254 1310
## 7 2013 1 1 820 820 0 1249 1329
## 8 2013 1 1 840 845 -5 1311 1350
## 9 2013 1 1 909 810 59 1331 1315
## 10 2013 1 1 913 918 -5 1346 1416
## # ... with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
It will provide flight destinations codes that do not have a name in the airports table.
airports %>%
#anti join flights table
anti_join(flights, by = c("faa" = "dest"))
## # A tibble: 1,357 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_Yo~
## 2 06A Moton Field Municipal A~ 32.5 -85.7 264 -6 A America/Chicago
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chicago
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_Yo~
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_Yo~
## 6 0A9 Elizabethton Municipal ~ 36.4 -82.2 1593 -5 A America/New_Yo~
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/New_Yo~
## 8 0G7 Finger Lakes Regional A~ 42.9 -76.8 492 -5 A America/New_Yo~
## 9 0P2 Shoestring Aviation Air~ 39.8 -76.6 1000 -5 U America/New_Yo~
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_An~
## # ... with 1,347 more rows
It will provide names of airports that the flights table didn’t have as a destination.
planes_carriers <-
flights %>%
#filter for planes that do not have a missing tail number
filter(!is.na(tailnum)) %>%
#get distinct pairs of tail number and carrier combinations
distinct(tailnum, carrier)
tail_count <- planes_carriers %>%
#count the tail numbers
count(tailnum, sort = TRUE) %>%
#filter the count for values greater than 1
filter(n>1)
planes_carriers %>%
#semi join tail_count to keep only planes that had a value greater than one signifying that have had more than one owner
semi_join(tail_count, by = "tailnum") %>%
#left join planes to get the details of the planes
left_join(planes, by = "tailnum") %>%
#arrange by tailnum to see the different carriers that owned the same plane
arrange(tailnum)
## # A tibble: 34 x 10
## carrier tailnum year type manufacturer model engines seats speed engine
## <chr> <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 9E N146PQ 2007 Fixed w~ BOMBARDIER I~ CL-6~ 2 95 NA Turbo~
## 2 EV N146PQ 2007 Fixed w~ BOMBARDIER I~ CL-6~ 2 95 NA Turbo~
## 3 9E N153PQ 2007 Fixed w~ BOMBARDIER I~ CL-6~ 2 95 NA Turbo~
## 4 EV N153PQ 2007 Fixed w~ BOMBARDIER I~ CL-6~ 2 95 NA Turbo~
## 5 9E N176PQ 2008 Fixed w~ BOMBARDIER I~ CL-6~ 2 95 NA Turbo~
## 6 EV N176PQ 2008 Fixed w~ BOMBARDIER I~ CL-6~ 2 95 NA Turbo~
## 7 9E N181PQ 2008 Fixed w~ BOMBARDIER I~ CL-6~ 2 95 NA Turbo~
## 8 EV N181PQ 2008 Fixed w~ BOMBARDIER I~ CL-6~ 2 95 NA Turbo~
## 9 9E N197PQ 2008 Fixed w~ BOMBARDIER I~ CL-6~ 2 95 NA Turbo~
## 10 EV N197PQ 2008 Fixed w~ BOMBARDIER I~ CL-6~ 2 95 NA Turbo~
## # ... with 24 more rows
Seventeen planes have been used by more than one carrier.