library(nycflights13)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# Map every airport that is the destination of at least one flight,
# drawn as points over the US state outlines.
ggplot(
  data = semi_join(airports, flights, by = c("faa" = "dest")),
  mapping = aes(x = lon, y = lat)
) +
  borders("state") +
  geom_point() +
  coord_quickmap()

## I will be analyzing the flights data, so I first mapped the airports that appear as flight destinations.
# Same destination-airport map, built in two named steps and printed
# explicitly.
airports2 <- semi_join(airports, flights, by = c("faa" = "dest"))
airportsplot <- ggplot(data = airports2, mapping = aes(x = lon, y = lat)) +
  borders("state") +
  geom_point() +
  coord_quickmap()
print(airportsplot)

# flights %>%
# left_join(select(airports, faa, lat, lon), by = c("origin" = "faa")) %>%
# rename(lat_origin = lat,
# lon_origin = lon) %>%
# left_join(select(airports, faa, lat, lon), by = c("dest" = "faa")) %>%
# rename(lat_dest = lat,
# lon_dest = lon) %>%
# select(origin, dest, matches("lat|lon"))
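## I decided to use left_join() to keep every row of flights and attach the matching latitude and longitude from the airports table, first for the origin and then for the destination. This will facilitate the analysis of the data as it will be in a cleaner format, with both locations on each flight. Furthermore, I renamed lat and lon to lat_origin/lon_origin and lat_dest/lon_dest so that the two sets of coordinates can be told apart.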
# Scatter plot of each plane's average total delay against its age in 2013.
flights %>%
  mutate(tot_delay = arr_delay + dep_delay) %>%
  group_by(tailnum) %>%
  summarize(avg_delay = mean(tot_delay, na.rm = TRUE)) %>%
  left_join(select(planes, tailnum, year), by = "tailnum") %>%
  # `year` is overwritten with 2013 minus the plane's year, i.e. its age;
  # planes without a match in `planes` keep NA here.
  mutate(year = 2013 - year) %>%
  ggplot(aes(avg_delay, year)) +
  geom_point() +
  # Label the axes explicitly: the y column is still called `year` but
  # holds an age, so the default label would be misleading.
  labs(
    x = "Average total delay (arrival + departure)",
    y = "Plane age in 2013 (years)"
  )
## Warning: Removed 798 rows containing missing values (geom_point).

## I used mutate() to combine the variables arr_delay and dep_delay into a total delay. Furthermore, the group_by function grouped the flights by tail number so that the average delay could be computed per plane. In addition, the left_join attached each plane's year from the planes table, matched by tailnum.
# There is no clear relationship between the age of a plane and its delays. As can be seen, newer planes have some of the most significant delays compared to older planes, and the results are widely varied. Therefore, delays are sporadic and the age of the plane has a limited impact on them.
# Mean combined (arrival + departure) delay for each calendar day.
# The result stays grouped by month, as summarize() drops only the last
# grouping level.
avg_del <- flights %>%
  group_by(month, day) %>%
  summarize(avg_delay = mean(arr_delay + dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
# Daily mean of each weather measurement, ignoring missing readings.
# across() replaces the superseded summarize_at(); the former select(-hour)
# step is dropped because `hour` is not one of the summarised columns.
avg_weather <- weather %>%
  group_by(month, day) %>%
  summarize(
    across(c(temp, humid, wind_speed, precip), ~ mean(.x, na.rm = TRUE)),
    # Keep the result grouped by month, exactly as summarize_at() left it.
    .groups = "drop_last"
  )
# Box plots of each daily weather metric, split by 30-minute-wide bins of
# the daily average delay, one facet per metric.
avg_del %>%
  # Join keys are stated explicitly so an added shared column cannot
  # silently change the join.
  left_join(avg_weather, by = c("month", "day")) %>%
  ungroup() %>%
  mutate(avg_delay = cut_width(avg_delay, 30)) %>%
  # Reshape the weather columns (everything after avg_delay) to long form;
  # pivot_longer() replaces the superseded gather().
  pivot_longer(
    cols = -(month:avg_delay),
    names_to = "weather",
    values_to = "metrics"
  ) %>%
  ggplot(aes(avg_delay, metrics)) +
  geom_boxplot() +
  facet_wrap(~ weather, scales = "free_y")
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

## I decided to first compute the total delay and summarize it as an average delay per day, removing the N/A values. Once done, I took the weather data, grouped it by month and day, and dropped the hour column. This helped me summarize the data of temp, humid, wind speed, and precipitation as daily averages. Then I used left_join to merge the average weather data with the average delay. Once done, I gathered the weather columns (everything except month, day, and the average delay) into a long format, and created a box plot representation of the different weather phenomena.
# The graphs demonstrate that high humidity levels are mostly responsible for major delays. Furthermore, the median temperatures demonstrate that they may cause delays as well. Also, wind speed causes minor delays for the most part, as does rain. I believe that humidity affects the mechanics of the plane alongside extreme temperatures, which may lead to delays for safety reasons.
# Map the average total delay per destination for flights on June 13,
# coloured on a viridis scale over the US state outlines.
flights %>%
  # Restrict to the day of interest first; only its groups are summarised.
  filter(month == 6, day == 13) %>%
  group_by(month, day, dest) %>%
  summarize(avg_delay = mean(arr_delay + dep_delay, na.rm = TRUE)) %>%
  left_join(select(airports, faa, lat, lon), by = c("dest" = "faa")) %>%
  ggplot(mapping = aes(x = lon, y = lat, colour = avg_delay)) +
  borders("state") +
  geom_point(size = 3, alpha = 1) +
  # Points outside these limits are dropped from the plot.
  xlim(c(-130, -65)) +
  ylim(c(20, 50)) +
  coord_quickmap() +
  viridis::scale_color_viridis()
## `summarise()` has grouped output by 'month', 'day'. You can override using the `.groups` argument.
## Warning: Removed 5 rows containing missing values (geom_point).

## I transferred the initial code (lines 87-90) from my previous chunk, and filtered for the date that we are looking for, June 13, 2013. For lines 92-94, a friend helped me make the appropriate combination of columns by using left_join and overlay the state borders on the map. Then the x axis served as the longitude and the y axis as the latitude, which facilitated the making of the map. Lastly, I used coord_quickmap and the viridis package to create a colored scale with the varying time delays. After researching the delays with weather references, I found out that there was a major storm on the East Coast during that day, which led to the cancellation of flights.
# Count, per carrier, the flights whose tail number has no match in
# `planes`, largest count first.
count(
  anti_join(flights, planes, by = "tailnum"),
  carrier,
  sort = TRUE
)
## # A tibble: 10 x 2
## carrier n
## <chr> <int>
## 1 MQ 25397
## 2 AA 22558
## 3 UA 1693
## 4 9E 1044
## 5 B6 830
## 6 US 699
## 7 FL 187
## 8 DL 110
## 9 F9 50
## 10 WN 38
## The carriers MQ and AA do not report their corresponding tail numbers, as can be seen from the count.
# Keep only the flights flown by planes with at least 100 recorded flights.
# Missing tail numbers are removed before counting: otherwise all of them
# are tallied together as one "plane", and because joins match NA to NA by
# default, those flights would be kept in the result.
flights %>%
  semi_join(
    flights %>%
      filter(!is.na(tailnum)) %>%
      count(tailnum) %>%
      filter(n >= 100),
    by = "tailnum"
  )
# NOTE(review): the output recorded below was produced before the missing
# tail numbers were excluded, so its row count may be too high -- re-run to
# refresh it.
## # A tibble: 230,902 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 544 545 -1 1004 1022
## 4 2013 1 1 554 558 -4 740 728
## 5 2013 1 1 555 600 -5 913 854
## 6 2013 1 1 557 600 -3 709 723
## 7 2013 1 1 557 600 -3 838 846
## 8 2013 1 1 558 600 -2 849 851
## 9 2013 1 1 558 600 -2 853 856
## 10 2013 1 1 558 600 -2 923 937
## # … with 230,892 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# The ten most common make/model combinations (ties kept), most common
# first. slice_max() replaces the superseded top_n() and makes the separate
# arrange() step unnecessary.
ten_common <- fueleconomy::common %>%
  ungroup() %>%
  slice_max(order_by = n, n = 10)

# All vehicle records belonging to those models; the join keys are stated
# explicitly instead of relying on the shared column names.
fueleconomy::vehicles %>%
  semi_join(ten_common, by = c("make", "model"))
## # A tibble: 1,553 x 12
## id make model year class trans drive cyl displ fuel hwy cty
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1182 Chevr… Camaro 1985 Subcom… Autom… Rear-… 6 2.8 Regu… 24 16
## 2 1183 Chevr… Camaro 1985 Subcom… Manua… Rear-… 6 2.8 Regu… 24 15
## 3 1184 Chevr… Camaro 1985 Subcom… Autom… Rear-… 8 5 Regu… 20 15
## 4 62 Chevr… Camaro 1985 Subcom… Manua… Rear-… 4 2.5 Regu… 30 19
## 5 63 Chevr… Camaro 1985 Subcom… Autom… Rear-… 6 2.8 Regu… 24 16
## 6 64 Chevr… Camaro 1985 Subcom… Manua… Rear-… 6 2.8 Regu… 25 16
## 7 65 Chevr… Camaro 1985 Subcom… Autom… Rear-… 8 5 Regu… 20 14
## 8 66 Chevr… Camaro 1985 Subcom… Manua… Rear-… 8 5 Regu… 22 14
## 9 67 Chevr… Camaro 1985 Subcom… Autom… Rear-… 8 5 Regu… 20 15
## 10 1755 Chevr… Camaro 1986 Subcom… Manua… Rear-… 4 2.5 Regu… 29 21
## # … with 1,543 more rows
# Total delay per day and per two consecutive days, worst two-day window
# first. Note that `avg_delay` holds a daily SUM of arrival + departure
# delays; the column name is kept so code that uses `fn` keeps working.
fn <- flights %>%
  group_by(month, day) %>%
  summarize(
    avg_delay = sum(arr_delay + dep_delay, na.rm = TRUE),
    # Drop all grouping so lag() below runs over the whole year. While the
    # data stayed grouped by month, the first day of every month had no
    # previous day and windows spanning two months were never evaluated.
    .groups = "drop"
  ) %>%
  mutate(twoday_delay = avg_delay + lag(avg_delay)) %>%
  arrange(desc(twoday_delay))
# NOTE(review): the printed results recorded further down were produced
# with the per-month lag (and show `Groups: month`); re-run to refresh them.
# Daily mean humidity, precipitation and temperature, ignoring missing
# readings. across() replaces the superseded summarize_at().
wea <- weather %>%
  group_by(month, day) %>%
  summarize(
    across(c(humid, precip, temp), ~ mean(.x, na.rm = TRUE)),
    # Keep the result grouped by month, exactly as summarize_at() left it.
    .groups = "drop_last"
  )
# Attach the daily weather to the two-day delay totals and list the WORST
# two-day windows first. The previous ascending sort put the windows with
# the smallest (most negative) totals at the top.
fn %>%
  left_join(wea, by = c("month", "day")) %>%
  arrange(desc(twoday_delay))
# NOTE(review): the table recorded below comes from the earlier ascending
# sort, so it shows the least-delayed windows; re-run to refresh it.
## # A tibble: 365 x 7
## # Groups: month [12]
## month day avg_delay twoday_delay humid precip temp
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9 6 -17611 -32992 45.5 0 64.1
## 2 10 2 -13615 -31911 58.9 0 73.3
## 3 9 7 -14153 -31764 50.5 0 68.7
## 4 9 5 -15381 -29234 48.8 0 71.9
## 5 9 8 -8526 -22679 51.2 0 73.3
## 6 9 18 -9503 -19423 55.4 0 61.2
## 7 9 9 -10778 -19304 55.0 0 65.2
## 8 8 26 -8529 -16868 59.1 0.000417 75.5
## 9 8 27 -8109 -16638 66.3 0.000833 78.9
## 10 9 10 -5251 -16029 73.0 0 76.3
## # … with 355 more rows
## A friend assisted me with this code, and I concluded that humidity and precipitation have a limited impact on the worst plane delays.
# Flights whose destination code has no matching `faa` entry in `airports`.
flights %>%
  anti_join(airports, by = c("dest" = "faa"))
## # A tibble: 7,602 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 615 615 0 1039 1100
## 3 2013 1 1 628 630 -2 1137 1140
## 4 2013 1 1 701 700 1 1123 1154
## 5 2013 1 1 711 715 -4 1151 1206
## 6 2013 1 1 820 820 0 1254 1310
## 7 2013 1 1 820 820 0 1249 1329
## 8 2013 1 1 840 845 -5 1311 1350
## 9 2013 1 1 909 810 59 1331 1315
## 10 2013 1 1 913 918 -5 1346 1416
## # … with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
## Provides the flights whose destination is not present in the airports data set.
# Airports whose `faa` code never appears as a destination in `flights`.
airports %>%
  anti_join(flights, by = c("faa" = "dest"))
## # A tibble: 1,357 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_Yo…
## 2 06A Moton Field Municipal A… 32.5 -85.7 264 -6 A America/Chicago
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chicago
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_Yo…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_Yo…
## 6 0A9 Elizabethton Municipal … 36.4 -82.2 1593 -5 A America/New_Yo…
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/New_Yo…
## 8 0G7 Finger Lakes Regional A… 42.9 -76.8 492 -5 A America/New_Yo…
## 9 0P2 Shoestring Aviation Air… 39.8 -76.6 1000 -5 U America/New_Yo…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_An…
## # … with 1,347 more rows
## Due to the reversed order of the tables, this provides the airports that are present in the airports data set but do not appear as a destination in the flights data set.
# Tail numbers that were flown by more than one carrier.
# The previous version counted rows per (tailnum, carrier) group and then
# filtered on n() > 1 while still grouped; each group held exactly one row
# after count(), so the filter could never match and the result was always
# empty. Here the carriers are counted per tail number instead.
flights %>%
  # Missing tail numbers do not identify a plane, so leave them out.
  filter(!is.na(tailnum)) %>%
  distinct(tailnum, carrier) %>%
  count(tailnum, name = "n_carriers") %>%
  filter(n_carriers > 1) %>%
  select(tailnum)
# NOTE(review): the empty result recorded below came from the previous
# version; re-run and revisit the conclusion that follows it.
## # A tibble: 0 x 2
## # Groups: tailnum, carrier [0]
## # … with 2 variables: carrier <chr>, tailnum <chr>
## I accept this hypothesis as there are no planes with a specific tail number that are used or shared by more than one airline. Each airline has a specific number of planes in its fleet, which are only managed by that airline.