NYC Flights Homework

Load the libraries and view the “flights” dataset

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Uncomment to browse the full table interactively:
# view(flights)

# Summary statistics for every column of `flights`, using psych's describe().
# NOTE(review): the min/max warnings printed with this call appear to come
# from the time_hour column, whose row is all NaN / Inf -- confirm.
psych::describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                vars      n    mean      sd median trimmed     mad  min  max
## year              1 336776 2013.00    0.00   2013 2013.00    0.00 2013 2013
## month             2 336776    6.55    3.41      7    6.56    4.45    1   12
## day               3 336776   15.71    8.77     16   15.70   11.86    1   31
## dep_time          4 328521 1349.11  488.28   1401 1346.82  634.55    1 2400
## sched_dep_time    5 336776 1344.25  467.34   1359 1341.60  613.80  106 2359
## dep_delay         6 328521   12.64   40.21     -2    3.32    5.93  -43 1301
## arr_time          7 328063 1502.05  533.26   1535 1526.42  619.73    1 2400
## sched_arr_time    8 336776 1536.38  497.46   1556 1550.67  618.24    1 2359
## arr_delay         9 327346    6.90   44.63     -5   -1.03   20.76  -86 1272
## carrier*         10 336776    7.14    4.14      6    7.00    5.93    1   16
## flight           11 336776 1971.92 1632.47   1496 1830.51 1608.62    1 8500
## tailnum*         12 334264 1814.32 1199.75   1798 1778.21 1587.86    1 4043
## origin*          13 336776    1.95    0.82      2    1.94    1.48    1    3
## dest*            14 336776   50.03   28.12     50   49.56   32.62    1  105
## air_time         15 327346  150.69   93.69    129  140.03   75.61   20  695
## distance         16 336776 1039.91  733.23    872  955.27  569.32   17 4983
## hour             17 336776   13.18    4.66     13   13.15    5.93    1   23
## minute           18 336776   26.23   19.30     29   25.64   23.72    0   59
## time_hour        19 336776     NaN      NA     NA     NaN      NA  Inf -Inf
##                range  skew kurtosis   se
## year               0   NaN      NaN 0.00
## month             11 -0.01    -1.19 0.01
## day               30  0.01    -1.19 0.02
## dep_time        2399 -0.02    -1.09 0.85
## sched_dep_time  2253 -0.01    -1.20 0.81
## dep_delay       1344  4.80    43.95 0.07
## arr_time        2399 -0.47    -0.19 0.93
## sched_arr_time  2358 -0.35    -0.38 0.86
## arr_delay       1358  3.72    29.23 0.08
## carrier*          15  0.36    -1.21 0.01
## flight          8499  0.66    -0.85 2.81
## tailnum*        4042  0.17    -1.24 2.08
## origin*            2  0.09    -1.50 0.00
## dest*            104  0.13    -1.08 0.05
## air_time         675  1.07     0.86 0.16
## distance        4966  1.13     1.19 1.26
## hour              22  0.00    -1.21 0.01
## minute            59  0.09    -1.24 0.03
## time_hour       -Inf    NA       NA   NA

Now create one data visualization with this dataset

Use the nycflights13 dataset to create a bar graph of the top 10 airlines with the most delays

library(nycflights13)
library(ggplot2)
# Keep only flights whose date, schedule and arrival-time fields are all
# present. arr_delay is NOT part of this filter and can still be missing for
# the rows kept here, so the arrival-delay average below drops NAs itself.
flights_nona <- flights %>%
  filter(
    !is.na(year), !is.na(month), !is.na(day),
    !is.na(sched_dep_time), !is.na(arr_time), !is.na(sched_arr_time),
    !is.na(hour), !is.na(minute), !is.na(time_hour)
  )

# Build one summary row per carrier:
#   count     - number of flights flown by the carrier
#   dist      - mean distance of the carrier's flights
#   delay     - mean arrival delay of the carrier's flights, ignoring flights
#               with no recorded arrival delay (without na.rm = TRUE a single
#               NA turns the whole carrier's mean into NA)
#   delaycost - delay cost index: (number of flights) * delay / distance
delays <- flights_nona %>%
  group_by(carrier) %>%
  summarize(
    count = n(),
    dist = mean(distance),
    delay = mean(arr_delay, na.rm = TRUE),
    delaycost = count * delay / dist # already one value per carrier
  )

delays <- arrange(delays, desc(count)) # carriers with the most flights first
head(delays)                           # look at the data
## # A tibble: 6 x 5
##   carrier count  dist delay delaycost
##   <chr>   <int> <dbl> <dbl>     <dbl>
## 1 UA      57916 1532.    NA        NA
## 2 B6      54137 1070.    NA        NA
## 3 EV      51251  563.    NA        NA
## 4 DL      47746 1238.    NA        NA
## 5 AA      32059 1343.    NA        NA
## 6 MQ      25076  571.    NA        NA

Sorted by `count`, United Airlines (UA) ranks first, which this analysis treats as the carrier with the most frequent delays.

Now create the bargraph.

# Bar chart with one bar per carrier: bar height is `count`, bar colour is the
# carrier. The result is a ggplot object (not a data frame); printing it draws
# the chart.
# NOTE(review): the axis is labelled "Number of Delays" -- confirm that the
# `count` column of `delays` is the quantity that label is meant to describe.
delays_carrier <- ggplot(data = delays) +
  geom_col(
    mapping = aes(x = carrier, y = count, fill = carrier),
    position = "dodge"
  ) +
  labs(
    title = "Airlines with the most Delays in 2013",
    y = "Number of Delays",
    fill = "Carrier"
  )
delays_carrier # look at the graph

Now get the top 10 Airlines bargraph.

# Bar chart restricted to the ten carriers with the largest `count`.
# The ten carriers are selected from the data instead of being listed by hand,
# so the chart stays a true "top 10" if the summary table changes.
# NOTE(review): the axis is labelled "Number of Delays" -- confirm that the
# `count` column of `delays` is the quantity that label is meant to describe.
top10 <- delays %>%
  slice_max(count, n = 10, with_ties = FALSE) %>%
  ggplot() +
  geom_bar(
    aes(x = carrier, y = count, fill = carrier),
    position = "dodge", stat = "identity"
  ) +
  labs(
    y = "Number of Delays", # labs() takes `y`; `ylab` does not set the axis title
    title = "Top 10 Airlines with Most Frequent Delays in 2013",
    fill = "Carrier"
  )
top10

What did this bargraph show?

This bar graph shows the delay counts for the top 10 airlines in 2013. UA is the carrier with the most frequent delays, with B6, EV, and DL not far behind. VX appears to be the least delayed of the carriers shown; every other carrier in the chart was delayed more often than VX. The airlines with the highest counts (UA, B6, EV, and DL) appear to have been delayed roughly 45,000 to 58,000 times each, so these are the airlines to avoid.

Thank you :)