# October 1, 2024
## [1] "year" "month" "day" "dep_time" "dep_delay" "arr_time"
## [7] "arr_delay" "carrier" "tailnum" "flight" "origin" "dest"
## [13] "air_time" "distance" "hour" "minute"
## Rows: 32,735
## Columns: 16
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
## $ month <int> 6, 5, 12, 5, 7, 1, 12, 8, 9, 4, 6, 11, 4, 3, 10, 1, 2, 8, 10…
## $ day <int> 30, 7, 8, 14, 21, 1, 9, 13, 26, 30, 17, 22, 26, 25, 21, 23, …
## $ dep_time <int> 940, 1657, 859, 1841, 1102, 1817, 1259, 1920, 725, 1323, 940…
## $ dep_delay <dbl> 15, -3, -1, -4, -3, -3, 14, 85, -10, 62, 5, 5, -2, 115, -4, …
## $ arr_time <int> 1216, 2104, 1238, 2122, 1230, 2008, 1617, 2032, 1027, 1549, …
## $ arr_delay <dbl> -4, 10, 11, -34, -8, 3, 22, 71, -8, 60, -4, -2, 22, 91, -6, …
## $ carrier <chr> "VX", "DL", "DL", "DL", "9E", "AA", "WN", "B6", "AA", "EV", …
## $ tailnum <chr> "N626VA", "N3760C", "N712TW", "N914DL", "N823AY", "N3AXAA", …
## $ flight <int> 407, 329, 422, 2391, 3652, 353, 1428, 1407, 2279, 4162, 20, …
## $ origin <chr> "JFK", "JFK", "JFK", "JFK", "LGA", "LGA", "EWR", "JFK", "LGA…
## $ dest <chr> "LAX", "SJU", "LAX", "TPA", "ORF", "ORD", "HOU", "IAD", "MIA…
## $ air_time <dbl> 313, 216, 376, 135, 50, 138, 240, 48, 148, 110, 50, 161, 87,…
## $ distance <dbl> 2475, 1598, 2475, 1005, 296, 733, 1411, 228, 1096, 820, 264,…
## $ hour <dbl> 9, 16, 8, 18, 11, 18, 12, 19, 7, 13, 9, 13, 8, 20, 12, 20, 6…
## $ minute <dbl> 40, 57, 59, 41, 2, 17, 59, 20, 25, 23, 40, 20, 9, 54, 17, 24…
# The three histograms show that a larger "binwidth" (150) groups the delayed flights into a few wide bars, while the smaller "binwidth" (15) appears less accurate because its bars are so narrow and squished together.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# There were 68 flights departing New York and heading to SFO in February.
# Keep only the flights whose destination is LAX.
lax_flights <- filter(nycflights, dest == "LAX")

# Histogram of dep_delay for the LAX flights. No `bins` or `binwidth` is
# supplied, so ggplot2 falls back to its default binning (see message below).
ggplot(lax_flights, mapping = aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Keep only the flights headed to SFO in month 2 (both conditions must hold).
sfo_feb_flights <- filter(nycflights, dest == "SFO", month == 2)

# Histogram of dep_delay for that subset, again with ggplot2's default binning.
ggplot(sfo_feb_flights, mapping = aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Rows: 68
## Columns: 16
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
## $ month <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ day <int> 18, 3, 15, 18, 24, 25, 7, 15, 13, 8, 11, 13, 25, 20, 12, 27,…
## $ dep_time <int> 1527, 613, 955, 1928, 1340, 1415, 1032, 1805, 1056, 656, 191…
## $ dep_delay <dbl> 57, 14, -5, 15, 2, -10, 1, 20, -4, -4, 40, -2, -1, -6, -7, 2…
## $ arr_time <int> 1903, 1008, 1313, 2239, 1644, 1737, 1352, 2122, 1412, 1039, …
## $ arr_delay <dbl> 48, 38, -28, -6, -21, -13, -10, 2, -13, -6, 2, -5, -30, -22,…
## $ carrier <chr> "DL", "UA", "DL", "UA", "UA", "UA", "B6", "AA", "UA", "DL", …
## $ tailnum <chr> "N711ZX", "N502UA", "N717TW", "N24212", "N76269", "N532UA", …
## $ flight <int> 1322, 691, 1765, 1214, 1111, 394, 641, 177, 642, 1865, 272, …
## $ origin <chr> "JFK", "JFK", "JFK", "EWR", "EWR", "JFK", "JFK", "JFK", "JFK…
## $ dest <chr> "SFO", "SFO", "SFO", "SFO", "SFO", "SFO", "SFO", "SFO", "SFO…
## $ air_time <dbl> 358, 367, 338, 353, 341, 355, 359, 338, 347, 361, 332, 351, …
## $ distance <dbl> 2586, 2586, 2586, 2565, 2565, 2586, 2586, 2586, 2586, 2586, …
## $ hour <dbl> 15, 6, 9, 19, 13, 14, 10, 18, 10, 6, 19, 8, 10, 18, 7, 17, 1…
## $ minute <dbl> 27, 13, 55, 28, 40, 15, 32, 5, 56, 56, 10, 33, 48, 49, 23, 2…
# The arrival delays show that flights to SFO in February were more often early or on time than late. Most of the values sit on the left (negative) side of the distribution.
## [1] 48 38 -28 -6 -21 -13 -10 2 -13 -6 2 -5 -30 -22 -40 -1 -17 -24 21
## [20] -13 -5 -6 34 -45 -18 -14 -11 45 -48 8 -3 -3 -23 -35 99 -18 -17 18
## [39] -5 -20 11 3 -2 8 -27 -30 -30 16 -66 196 76 -10 -11 1 -7 -13 -26
## [58] -9 9 7 -15 -35 -20 -14 -20 -20 -34 -39
#DL has the most variable delays
# July (month 7) is expected to have the highest average delay departing from a NYC airport; it has the largest mean_dd (20.8) in the monthly summary.
# Per-origin summary of the SFO February flights: median and interquartile
# range of dep_delay, plus the number of flights in each group.
summarise(
  group_by(sfo_feb_flights, origin),
  median_dd = median(dep_delay),
  iqr_dd = IQR(dep_delay),
  n_flights = n()
)
## # A tibble: 2 × 4
## origin median_dd iqr_dd n_flights
## <chr> <dbl> <dbl> <int>
## 1 EWR 0.5 5.75 8
## 2 JFK -2.5 15.2 60
## # A tibble: 1 × 2
## mean_ard iqr_ard
## <dbl> <dbl>
## 1 -4.5 23.2
## # A tibble: 68 × 16
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr>
## 1 2013 2 15 1805 20 2122 2 AA N335AA
## 2 2013 2 4 1107 37 1440 45 AA N343AA
## 3 2013 2 27 1830 45 2128 8 AA N329AA
## 4 2013 2 7 1741 -4 2117 -3 AA N335AA
## 5 2013 2 24 1547 17 1928 18 AA N381AA
## 6 2013 2 5 744 -1 1133 8 AA N383AA
## 7 2013 2 25 916 91 1241 76 AA N335AA
## 8 2013 2 25 1030 0 1356 1 AA N367AA
## 9 2013 2 11 1539 9 1844 -26 AA N352AA
## 10 2013 2 21 1745 0 2106 -14 AA N329AA
## # ℹ 58 more rows
## # ℹ 7 more variables: flight <int>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>
## # A tibble: 12 × 2
## month mean_dd
## <int> <dbl>
## 1 7 20.8
## 2 6 20.4
## 3 12 17.4
## 4 4 14.6
## 5 3 13.5
## 6 5 13.3
## 7 8 12.6
## 8 2 10.7
## 9 1 10.2
## 10 9 6.87
## 11 11 6.10
## 12 10 5.88
# If I made my decision based on the month with the lowest mean, I would be at a disadvantage, because the mean is not the best representation of the data. Choosing the month with the lowest median would be the better decision because the median is more representative and paints a better picture.
# Label every flight by departure status: "on time" when dep_delay is below 5,
# "delayed" otherwise.
nycflights <- mutate(
  nycflights,
  dep_type = ifelse(dep_delay < 5, "on time", "delayed")
)

# Fraction of "on time" departures for each origin, sorted from the highest
# rate to the lowest.
arrange(
  summarise(
    group_by(nycflights, origin),
    ot_dep_rate = sum(dep_type == "on time") / n()
  ),
  desc(ot_dep_rate)
)
## # A tibble: 3 × 2
## origin ot_dep_rate
## <chr> <dbl>
## 1 LGA 0.728
## 2 JFK 0.694
## 3 EWR 0.637
# I would choose LGA based on its on-time departure percentage.
# Recompute the departure-status label with the same rule used earlier in this
# file ("on time" when dep_delay < 5, otherwise "delayed").
nycflights <- mutate(
  nycflights,
  dep_type = ifelse(dep_delay < 5, "on time", "delayed")
)

# On-time departure rate per origin, in descending order of the rate.
flights_by_origin_rate <- function(flights) {
  rates <- summarise(
    group_by(flights, origin),
    ot_dep_rate = sum(dep_type == "on time") / n()
  )
  arrange(rates, desc(ot_dep_rate))
}
flights_by_origin_rate(nycflights)
## # A tibble: 3 × 2
## origin ot_dep_rate
## <chr> <dbl>
## 1 LGA 0.728
## 2 JFK 0.694
## 3 EWR 0.637
# Add avg_speed (distance divided by air_time / 60) and refresh the dep_type
# label ("on time" when dep_delay < 5, otherwise "delayed") in a single call.
nycflights <- mutate(
  nycflights,
  avg_speed = distance / (air_time / 60),
  dep_type = ifelse(dep_delay < 5, "on time", "delayed")
)
# For shorter distances the average speed is lower; once the distance reaches about 500, the average speed levels out and stays roughly the same as the distance grows.
# Keep only the flights whose carrier is AA, DL, or UA.
nycflights_of_AA_DA_UA <- filter(
  nycflights,
  carrier %in% c("AA", "DL", "UA")
)

# Scatter plot of arr_delay against dep_delay, with points colored by carrier.
ggplot(
  nycflights_of_AA_DA_UA,
  mapping = aes(x = dep_delay, y = arr_delay, color = carrier)
) +
  geom_point()