Load the libraries and view the “flights” dataset

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
# Open the flights table in the data viewer.
view(flights)
# Print descriptive statistics for every column. Character columns are listed
# with a trailing "*" in the output, and time_hour (a date-time column) yields
# NaN/Inf entries; the min/max warnings below appear to come from that column.
describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                vars      n    mean      sd median trimmed     mad  min  max
## year              1 336776 2013.00    0.00   2013 2013.00    0.00 2013 2013
## month             2 336776    6.55    3.41      7    6.56    4.45    1   12
## day               3 336776   15.71    8.77     16   15.70   11.86    1   31
## dep_time          4 328521 1349.11  488.28   1401 1346.82  634.55    1 2400
## sched_dep_time    5 336776 1344.25  467.34   1359 1341.60  613.80  106 2359
## dep_delay         6 328521   12.64   40.21     -2    3.32    5.93  -43 1301
## arr_time          7 328063 1502.05  533.26   1535 1526.42  619.73    1 2400
## sched_arr_time    8 336776 1536.38  497.46   1556 1550.67  618.24    1 2359
## arr_delay         9 327346    6.90   44.63     -5   -1.03   20.76  -86 1272
## carrier*         10 336776    7.14    4.14      6    7.00    5.93    1   16
## flight           11 336776 1971.92 1632.47   1496 1830.51 1608.62    1 8500
## tailnum*         12 334264 1814.32 1199.75   1798 1778.21 1587.86    1 4043
## origin*          13 336776    1.95    0.82      2    1.94    1.48    1    3
## dest*            14 336776   50.03   28.12     50   49.56   32.62    1  105
## air_time         15 327346  150.69   93.69    129  140.03   75.61   20  695
## distance         16 336776 1039.91  733.23    872  955.27  569.32   17 4983
## hour             17 336776   13.18    4.66     13   13.15    5.93    1   23
## minute           18 336776   26.23   19.30     29   25.64   23.72    0   59
## time_hour        19 336776     NaN      NA     NA     NaN      NA  Inf -Inf
##                range  skew kurtosis   se
## year               0   NaN      NaN 0.00
## month             11 -0.01    -1.19 0.01
## day               30  0.01    -1.19 0.02
## dep_time        2399 -0.02    -1.09 0.85
## sched_dep_time  2253 -0.01    -1.20 0.81
## dep_delay       1344  4.80    43.95 0.07
## arr_time        2399 -0.47    -0.19 0.93
## sched_arr_time  2358 -0.35    -0.38 0.86
## arr_delay       1358  3.72    29.23 0.08
## carrier*          15  0.36    -1.21 0.01
## flight          8499  0.66    -0.85 2.81
## tailnum*        4042  0.17    -1.24 2.08
## origin*            2  0.09    -1.50 0.00
## dest*            104  0.13    -1.08 0.05
## air_time         675  1.07     0.86 0.16
## distance        4966  1.13     1.19 1.26
## hour              22  0.00    -1.21 0.01
## minute            59  0.09    -1.24 0.03
## time_hour       -Inf    NA       NA   NA
# List the column names of the flights table.
names(flights)
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"
# Show the type and the first few values of each column.
str(flights)
## tibble [336,776 x 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Keep only the rows with no missing value in any column, then list the
# flights whose departure delay exceeds 1000.
NYCFlights <- flights %>%
  na.omit()
NYCFlights %>%
  filter(dep_delay > 1000)
## # A tibble: 5 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     9      641            900      1301     1242           1530
## 2  2013     1    10     1121           1635      1126     1239           1810
## 3  2013     6    15     1432           1935      1137     1607           2120
## 4  2013     7    22      845           1600      1005     1044           1815
## 5  2013     9    20     1139           1845      1014     1457           2210
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Store the flights with a departure delay above 1000 and print them from the
# longest delay to the shortest. The object shares its name with the dep_delay
# column; inside filter() and desc() the name is looked up in the data first,
# so there it refers to the column.
dep_delay <- NYCFlights %>%
  filter(dep_delay > 1000)
dep_delay %>%
  arrange(desc(dep_delay))
## # A tibble: 5 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     9      641            900      1301     1242           1530
## 2  2013     6    15     1432           1935      1137     1607           2120
## 3  2013     1    10     1121           1635      1126     1239           1810
## 4  2013     9    20     1139           1845      1014     1457           2210
## 5  2013     7    22      845           1600      1005     1044           1815
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Average departure delay across the flights stored in the dep_delay table.
dep_delay %>%
  summarise(mean_dep_delay = mean(dep_delay))
## # A tibble: 1 x 1
##   mean_dep_delay
##            <dbl>
## 1          1117.
# Drop the flights with a negative departure delay, then print the table with
# an extra speed column. The result of mutate() is only printed, not stored.
NYCFlights <- NYCFlights[!(NYCFlights$dep_delay < 0), ]
NYCFlights %>%
  mutate(speed = distance / air_time * 60)
## # A tibble: 144,211 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      559            559         0      702            706
##  5  2013     1     1      600            600         0      851            858
##  6  2013     1     1      600            600         0      837            825
##  7  2013     1     1      601            600         1      844            850
##  8  2013     1     1      607            607         0      858            915
##  9  2013     1     1      608            600         8      807            735
## 10  2013     1     1      611            600        11      945            931
## # ... with 144,201 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   speed <dbl>
# Disabled draft of a departure-delay histogram, kept for reference:
# ggplot(NYCFlights, aes(x = dep_delay)) +
#   geom_histogram()
# Quartiles, mean and range of the departure delays.
summary(NYCFlights$dep_delay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.00   15.00   34.76   44.00 1301.00
# Quartiles, mean and range of the arrival delays.
summary(NYCFlights[["arr_delay"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -70.00   -4.00   13.00   30.03   45.00 1272.00
# Flights whose departure delay is one of the whole numbers 3 to 44 (the first
# and third quartiles reported by the summary above).
filter(NYCFlights, dep_delay %in% 3:44)
## # A tibble: 77,831 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      533            529         4      850            830
##  2  2013     1     1      608            600         8      807            735
##  3  2013     1     1      611            600        11      945            931
##  4  2013     1     1      613            610         3      925            921
##  5  2013     1     1      623            610        13      920            915
##  6  2013     1     1      632            608        24      740            728
##  7  2013     1     1      644            636         8      931            940
##  8  2013     1     1      709            700         9      852            832
##  9  2013     1     1      732            729         3     1041           1039
## 10  2013     1     1      743            730        13     1107           1100
## # ... with 77,821 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Copy of the table without rows that lack a departure delay, followed by the
# number of whole values in the 3 to 44 range.
df <- drop_na(NYCFlights, dep_delay)
length(3:44)
## [1] 42
# Show the structure of the filtered table.
str(df)
## tibble [144,211 x 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:144211] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:144211] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:144211] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:144211] 517 533 542 559 600 600 601 607 608 611 ...
##  $ sched_dep_time: int [1:144211] 515 529 540 559 600 600 600 607 600 600 ...
##  $ dep_delay     : num [1:144211] 2 4 2 0 0 0 1 0 8 11 ...
##  $ arr_time      : int [1:144211] 830 850 923 702 851 837 844 858 807 945 ...
##  $ sched_arr_time: int [1:144211] 819 830 850 706 858 825 850 915 735 931 ...
##  $ arr_delay     : num [1:144211] 11 20 33 -4 -7 12 -6 -17 32 14 ...
##  $ carrier       : chr [1:144211] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:144211] 1545 1714 1141 1806 371 4650 343 1077 3768 303 ...
##  $ tailnum       : chr [1:144211] "N14228" "N24211" "N619AA" "N708JB" ...
##  $ origin        : chr [1:144211] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:144211] "IAH" "IAH" "MIA" "BOS" ...
##  $ air_time      : num [1:144211] 227 227 160 44 152 134 147 157 139 366 ...
##  $ distance      : num [1:144211] 1400 1416 1089 187 1076 ...
##  $ hour          : num [1:144211] 5 5 5 5 6 6 6 6 6 6 ...
##  $ minute        : num [1:144211] 15 29 40 59 0 0 0 7 0 0 ...
##  $ time_hour     : POSIXct[1:144211], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
##  - attr(*, "na.action")= 'omit' Named int [1:9430] 472 478 616 644 726 734 755 839 840 841 ...
##   ..- attr(*, "names")= chr [1:9430] "472" "478" "616" "644" ...
# Histogram of the departure delays using bins that are 15 units wide.
df %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 15)

Here we want to focus on the delays of flights headed to BWI only. We first filter the data for flights headed to BWI (dest == “BWI”) in December and then plot the departure delays of only those flights.

# December flights to BWI, taken from the table of flights that did not leave
# early.
bwi_Dec_flights <- df %>%
  filter(dest == "BWI", month == 12)
# Line graph of departure delay against day of the month. The line colour is a
# fixed setting rather than an aesthetic mapping, so no legend is drawn for it.
# geom_line() has no `stats` argument and does not use `fill`, so both are
# left out.
ggplot(bwi_Dec_flights, aes(x = dep_delay, y = day)) +
  geom_line(colour = "red") +
  labs(
    x = "Departure Delay",
    y = "December 2013",
    title = "Graph of delayed flights in December"
  )

This line graph shows how the departure delays of flights from New York City to BWI varied during December 2013.

# Mean, standard deviation and number of departure delays for the December
# BWI flights.
summarise(
  bwi_Dec_flights,
  mean_dd = mean(dep_delay),
  sd_dd = sd(dep_delay),
  n = n()
)
## # A tibble: 1 x 3
##   mean_dd sd_dd     n
##     <dbl> <dbl> <int>
## 1    42.1  48.7    41
 # Bar chart of the number of flights in df per origin airport, outlined in
 # yellow. geom_bar() counts rows by default and has no `stats` argument; the
 # continuous `fill` mapping is dropped when rows are counted, so both are
 # left out.
 ggplot(data = df, aes(x = origin)) +
   geom_bar(col = "yellow")

 # July flights to SJU with zero departure and arrival delay, an hour value
 # from 10 to 18 and an air time from 120 to 240, longest air time first.
 NYCFlights %>%
   select(arr_delay, dep_delay, month, hour, air_time, carrier, dest) %>%
   filter(
     arr_delay == 0,
     dep_delay == 0,
     month == 7,
     between(hour, 10, 18),
     between(air_time, 120, 240),
     dest == "SJU"
   ) %>%
   arrange(desc(air_time))
## # A tibble: 2 x 7
##   arr_delay dep_delay month  hour air_time carrier dest 
##       <dbl>     <dbl> <int> <dbl>    <dbl> <chr>   <chr>
## 1         0         0     7    12      199 UA      SJU  
## 2         0         0     7    10      199 B6      SJU
# Ten largest December departure delays of at least 60 on flights to BWI,
# together with the carrier; the table is stored and then printed.
Most_dep_delay <- NYCFlights %>%
  filter(month == 12, dep_delay >= 60, dest == "BWI") %>%
  arrange(desc(dep_delay)) %>%
  select(dep_delay, carrier, dest, month) %>%
  head(10)
Most_dep_delay
## # A tibble: 10 x 4
##    dep_delay carrier dest  month
##        <dbl> <chr>   <chr> <int>
##  1       212 MQ      BWI      12
##  2       173 MQ      BWI      12
##  3       120 EV      BWI      12
##  4       114 EV      BWI      12
##  5       107 9E      BWI      12
##  6        94 MQ      BWI      12
##  7        87 EV      BWI      12
##  8        86 EV      BWI      12
##  9        83 EV      BWI      12
## 10        67 EV      BWI      12
# Bar chart of the departure delays in Most_dep_delay, stacked per carrier.
# The aesthetic must be spelled `y` (lower case) for dep_delay to be plotted,
# and geom_col() draws the values as given, which is what the unsupported
# `stats = "identity"` argument was meant to request. The data frame arrives
# through the pipe, so it is not passed a second time.
Most_dep_delay %>%
  ggplot(aes(x = carrier, y = dep_delay, fill = carrier)) +
  geom_col(col = "blue") +
  labs(
    x = "carrier",
    y = "Departure delay more than 60mins",
    title = "Bargraph showing carriers with more than one hour dep_delay time"
  )

The above bar graph shows the three carriers with departure delays of more than one hour in the month of December. These are the carriers that passengers or travelers should avoid, because not only do they have delays, but these delays last an hour or more during a very busy holiday period when people are trying to get home for family reunions.

# Flights to HNL with a distance of at least 4000, showing arrival time,
# origin, destination, distance, carrier and tail number.
NYCFlights %>%
  filter(dest == "HNL", distance >= 4000) %>%
  select(arr_time, origin, dest, distance, carrier, tailnum)
## # A tibble: 318 x 6
##    arr_time origin dest  distance carrier tailnum
##       <int> <chr>  <chr>    <dbl> <chr>   <chr>  
##  1     2005 EWR    HNL       4963 UA      N76065 
##  2     1525 JFK    HNL       4983 HA      N380HA 
##  3     1940 EWR    HNL       4963 UA      N77066 
##  4     1504 JFK    HNL       4983 HA      N380HA 
##  5     2006 EWR    HNL       4963 UA      N76064 
##  6     1516 JFK    HNL       4983 HA      N384HA 
##  7     1932 EWR    HNL       4963 UA      N76065 
##  8     1558 JFK    HNL       4983 HA      N385HA 
##  9     1927 EWR    HNL       4963 UA      N69063 
## 10     1620 JFK    HNL       4983 HA      N385HA 
## # ... with 308 more rows
 # Columns used for the speed plot. air_time is kept because average speed is
 # distance over the time spent in the air; arr_time is a clock reading (values
 # such as 830 or 1004 in the tables above), not a duration, so it cannot be
 # used as the divisor.
 Avg_speed_WRT_distance <- NYCFlights %>%
   select(arr_time, air_time, origin, dest, distance)
 # Multiplying by 60 converts distance per minute of air time into distance
 # per hour (air_time is recorded in minutes per the nycflights13 docs).
 # The data frame arrives through the pipe, so it is not passed a second time.
 Avg_speed_WRT_distance %>%
   mutate(avg_speed = distance / air_time * 60) %>%
   ggplot(aes(x = distance, y = avg_speed)) +
   geom_point(col = "red") +
   ggtitle("Scatterplot of Average speed versus Distance")

The above scatterplot represents the relationship between the distance of each flight and the average speed of that flight from New York to the rest of the country. Our data didn’t have a variable for average speed, so we needed to mutate the data to include average speed, which was obtained by dividing the total distance per flight by the flight's recorded time and multiplying by 60. After using our select and filter functions above, we noticed that the longest flight from New York City is to Honolulu (HNL) and the shortest is to Philadelphia (PHL). There is a direct relationship between distance and average speed up to a certain point, after which it changes because of a few outliers, which tells us that those destinations are outside the mainland USA.