library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
# First look at the raw flights table: hand it to the data viewer, then
# print psych's per-column descriptive statistics.
tibble::view(flights)
psych::describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
# List the column names of the flights table.
colnames(flights)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
# Show the type and the first few values of every column.
utils::str(flights)
## tibble [336,776 x 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Keep only the rows that have no missing value in any column.
NYCFlights <- flights %>% na.omit()
# Print the flights whose departure delay exceeds 1000.
NYCFlights %>% filter(dep_delay > 1000)
## # A tibble: 5 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 1 10 1121 1635 1126 1239 1810
## 3 2013 6 15 1432 1935 1137 1607 2120
## 4 2013 7 22 845 1600 1005 1044 1815
## 5 2013 9 20 1139 1845 1014 1457 2210
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Store the flights with a departure delay above 1000. The object shares its
# name with the dep_delay column; inside dplyr verbs the column is the one
# that gets used.
dep_delay <- NYCFlights %>% filter(dep_delay > 1000)
# Print them from the largest delay to the smallest.
dep_delay %>% arrange(desc(dep_delay))
## # A tibble: 5 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Average departure delay among the extreme-delay flights stored above.
dep_delay %>%
  summarise(mean_dep_delay = mean(dep_delay))
## # A tibble: 1 x 1
## mean_dep_delay
## <dbl>
## 1 1117.
# Drop the flights that left early: keep rows with a departure delay of
# zero or more.
NYCFlights <- NYCFlights[NYCFlights[["dep_delay"]] >= 0, ]
# Preview a derived speed column (distance over air time, scaled by 60).
# The result is only printed; NYCFlights itself is not modified.
NYCFlights %>%
  mutate(speed = distance / air_time * 60)
## # A tibble: 144,211 x 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 559 559 0 702 706
## 5 2013 1 1 600 600 0 851 858
## 6 2013 1 1 600 600 0 837 825
## 7 2013 1 1 601 600 1 844 850
## 8 2013 1 1 607 607 0 858 915
## 9 2013 1 1 608 600 8 807 735
## 10 2013 1 1 611 600 11 945 931
## # ... with 144,201 more rows, and 12 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # speed <dbl>
#ggplot(NYCFlights, aes(x = dep_delay)) +
# geom_histogram()+
# Five-number summary and mean of the departure delays.
NYCFlights %>%
  pull(dep_delay) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 3.00 15.00 34.76 44.00 1301.00
# Five-number summary and mean of the arrival delays.
NYCFlights %>%
  pull(arr_delay) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -70.00 -4.00 13.00 30.03 45.00 1272.00
# Keep the flights whose departure delay lies between the first and third
# quartiles reported by summary() above (3 and 44, both inclusive).
# between() is a true range test on the numeric column, so it does not rely
# on every delay being a whole number the way matching against seq(3, 44)
# does.
NYCFlights %>%
  filter(between(dep_delay, 3, 44))
## # A tibble: 77,831 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 533 529 4 850 830
## 2 2013 1 1 608 600 8 807 735
## 3 2013 1 1 611 600 11 945 931
## 4 2013 1 1 613 610 3 925 921
## 5 2013 1 1 623 610 13 920 915
## 6 2013 1 1 632 608 24 740 728
## 7 2013 1 1 644 636 8 931 940
## 8 2013 1 1 709 700 9 852 832
## 9 2013 1 1 732 729 3 1041 1039
## 10 2013 1 1 743 730 13 1107 1100
## # ... with 77,821 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Working copy of the data with any rows lacking a departure delay removed.
df <- drop_na(NYCFlights, dep_delay)
# Count the whole numbers from 3 to 44 inclusive.
length(3:44)
## [1] 42
# Confirm the structure and row count of the working copy.
utils::str(df)
## tibble [144,211 x 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:144211] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:144211] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:144211] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:144211] 517 533 542 559 600 600 601 607 608 611 ...
## $ sched_dep_time: int [1:144211] 515 529 540 559 600 600 600 607 600 600 ...
## $ dep_delay : num [1:144211] 2 4 2 0 0 0 1 0 8 11 ...
## $ arr_time : int [1:144211] 830 850 923 702 851 837 844 858 807 945 ...
## $ sched_arr_time: int [1:144211] 819 830 850 706 858 825 850 915 735 931 ...
## $ arr_delay : num [1:144211] 11 20 33 -4 -7 12 -6 -17 32 14 ...
## $ carrier : chr [1:144211] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:144211] 1545 1714 1141 1806 371 4650 343 1077 3768 303 ...
## $ tailnum : chr [1:144211] "N14228" "N24211" "N619AA" "N708JB" ...
## $ origin : chr [1:144211] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:144211] "IAH" "IAH" "MIA" "BOS" ...
## $ air_time : num [1:144211] 227 227 160 44 152 134 147 157 139 366 ...
## $ distance : num [1:144211] 1400 1416 1089 187 1076 ...
## $ hour : num [1:144211] 5 5 5 5 6 6 6 6 6 6 ...
## $ minute : num [1:144211] 15 29 40 59 0 0 0 7 0 0 ...
## $ time_hour : POSIXct[1:144211], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
## - attr(*, "na.action")= 'omit' Named int [1:9430] 472 478 616 644 726 734 755 839 840 841 ...
## ..- attr(*, "names")= chr [1:9430] "472" "478" "616" "644" ...
# Histogram of departure delays, using bins that are 15 units wide.
df %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 15)
Here we want to focus on the delays of flights headed to BWI only. We first filter the data for flights headed to BWI (dest == "BWI") in December (month == 12), and then plot the departure delays of only those flights.
# Flights to BWI in December only.
bwi_Dec_flights <- df %>%
  filter(dest == "BWI", month == 12)

# Line plot of departure delay (x) against day of the month (y).
# The colour is a fixed geom parameter rather than an aes() mapping: putting
# the string "red" inside aes() treats it as data, which draws the line in
# the default palette colour and adds a meaningless legend. geom_line()
# already uses the identity stat, so no stat argument is needed, and a line
# has no fill to map.
ggplot(bwi_Dec_flights, aes(x = dep_delay, y = day)) +
  geom_line(colour = "red") +
  labs(
    x = "Departure Delay",
    y = "December 2013",
    title = "Graph of delayed flights in December"
  )
This line graph shows how the departure delays of flights from New York City to BWI varied over the days of December 2013.
# Mean, standard deviation and number of departure delays for the December
# BWI flights.
summarise(
  bwi_Dec_flights,
  mean_dd = mean(dep_delay),
  sd_dd = sd(dep_delay),
  n = n()
)
## # A tibble: 1 x 3
## mean_dd sd_dd n
## <dbl> <dbl> <int>
## 1 42.1 48.7 41
# Bar chart of the number of flights per origin airport, with yellow bar
# outlines. geom_bar() counts rows by default, which is what a chart with
# only an x mapping needs, so the misspelt and ignored `stats` argument is
# dropped. The continuous dep_delay fill mapping is removed as well: it
# varies within every bar, so it cannot colour a counted bar.
ggplot(data = df, aes(x = origin)) +
  geom_bar(colour = "yellow")
# July flights to SJU that departed and arrived exactly on schedule, were
# scheduled to leave between hour 10 and hour 18, and spent 120 to 240 in
# the air; longest air time first.
NYCFlights %>%
  select(arr_delay, dep_delay, month, hour, air_time, carrier, dest) %>%
  filter(
    arr_delay == 0,
    dep_delay == 0,
    month == 7,
    between(hour, 10, 18),
    between(air_time, 120, 240),
    dest == "SJU"
  ) %>%
  arrange(desc(air_time))
## # A tibble: 2 x 7
## arr_delay dep_delay month hour air_time carrier dest
## <dbl> <dbl> <int> <dbl> <dbl> <chr> <chr>
## 1 0 0 7 12 199 UA SJU
## 2 0 0 7 10 199 B6 SJU
# The ten largest December departure delays of at least 60 on flights to
# BWI, keeping only the columns needed for the carrier comparison.
Most_dep_delay <- NYCFlights %>%
  filter(month == 12, dep_delay >= 60, dest == "BWI") %>%
  select(dep_delay, carrier, dest, month) %>%
  arrange(desc(dep_delay)) %>%
  head(10)
print(Most_dep_delay)
## # A tibble: 10 x 4
## dep_delay carrier dest month
## <dbl> <chr> <chr> <int>
## 1 212 MQ BWI 12
## 2 173 MQ BWI 12
## 3 120 EV BWI 12
## 4 114 EV BWI 12
## 5 107 9E BWI 12
## 6 94 MQ BWI 12
## 7 87 EV BWI 12
## 8 86 EV BWI 12
## 9 83 EV BWI 12
## 10 67 EV BWI 12
# Bar chart of the long December departure delays by carrier.
# The y aesthetic must be lower-case `y`; a capital `Y` is not an aesthetic,
# so dep_delay was never plotted and the bars showed row counts instead.
# geom_col() draws bars whose height is the y value itself (stacking the
# individual delays per carrier), which is what the misspelt
# `stats = "identity"` was meant to request. The data frame is passed once.
ggplot(Most_dep_delay, aes(x = carrier, y = dep_delay, fill = carrier)) +
  geom_col(colour = "blue") +
  labs(
    x = "carrier",
    y = "Departure delay more than 60mins",
    title = "Bargraph showing carriers with more than one hour dep_delay time"
  )
The above bar graph shows the three carriers with departure delays of more than one hour on flights to BWI in the month of December. These are the carriers that passengers or travelers should avoid: not only do they have delays, but these delays last an hour or more during a very busy holiday period when people are trying to get home for family reunions.
# Flights to HNL covering a distance of at least 4000, showing only the
# arrival time, route, distance, carrier and tail number.
NYCFlights %>%
  filter(dest == "HNL", distance >= 4000) %>%
  select(arr_time, origin, dest, distance, carrier, tailnum)
## # A tibble: 318 x 6
## arr_time origin dest distance carrier tailnum
## <int> <chr> <chr> <dbl> <chr> <chr>
## 1 2005 EWR HNL 4963 UA N76065
## 2 1525 JFK HNL 4983 HA N380HA
## 3 1940 EWR HNL 4963 UA N77066
## 4 1504 JFK HNL 4983 HA N380HA
## 5 2006 EWR HNL 4963 UA N76064
## 6 1516 JFK HNL 4983 HA N384HA
## 7 1932 EWR HNL 4963 UA N76065
## 8 1558 JFK HNL 4983 HA N385HA
## 9 1927 EWR HNL 4963 UA N69063
## 10 1620 JFK HNL 4983 HA N385HA
## # ... with 308 more rows
# Columns needed to relate average speed to distance. air_time is added
# because it is the flight's duration; arr_time is kept so the table still
# has every column it had before.
Avg_speed_WRT_distance <- NYCFlights %>%
  select(arr_time, air_time, origin, dest, distance)

# Scatterplot of average speed against distance.
# Speed is distance divided by time in the air, scaled by 60 (the same
# formula used for the `speed` column earlier in this script). arr_time
# cannot serve as the divisor: it is a clock reading (its values run from
# 1 to 2400 in the describe() output), not an elapsed duration.
Avg_speed_WRT_distance %>%
  mutate(avg_speed = distance / (air_time / 60)) %>%
  ggplot(aes(x = distance, y = avg_speed)) +
  geom_point(colour = "red") +
  ggtitle("Scatterplot of Average speed versus Distance")
The above scatterplot shows the relationship between the distance of each flight and its average speed, for flights from New York to the rest of the country. Our data didn't have a variable for average speed, so we used mutate to add one, obtained by dividing each flight's distance by its time (recorded in minutes) and multiplying by 60 to express the result per hour. After using our select and filter functions above, we noticed that the longest flight from New York City is to Honolulu (HNL) and the shortest is to Philadelphia (PHL). There is a direct relationship between distance and average speed up to a certain point, after which the pattern changes because of a few outliers; these outliers tell us that the destination is outside the mainland USA.