# Load the nycflights data set into the workspace and list its column names.
data("nycflights")
names(nycflights)
## [1] "year" "month" "day" "dep_time" "dep_delay" "arr_time"
## [7] "arr_delay" "carrier" "tailnum" "flight" "origin" "dest"
## [13] "air_time" "distance" "hour" "minute"
# Open the data set's help page, then preview each column's type and
# leading values.
help("nycflights")
glimpse(nycflights)
## Rows: 32,735
## Columns: 16
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
## $ month <int> 6, 5, 12, 5, 7, 1, 12, 8, 9, 4, 6, 11, 4, 3, 10, 1, 2, 8, 10…
## $ day <int> 30, 7, 8, 14, 21, 1, 9, 13, 26, 30, 17, 22, 26, 25, 21, 23, …
## $ dep_time <int> 940, 1657, 859, 1841, 1102, 1817, 1259, 1920, 725, 1323, 940…
## $ dep_delay <dbl> 15, -3, -1, -4, -3, -3, 14, 85, -10, 62, 5, 5, -2, 115, -4, …
## $ arr_time <int> 1216, 2104, 1238, 2122, 1230, 2008, 1617, 2032, 1027, 1549, …
## $ arr_delay <dbl> -4, 10, 11, -34, -8, 3, 22, 71, -8, 60, -4, -2, 22, 91, -6, …
## $ carrier <chr> "VX", "DL", "DL", "DL", "9E", "AA", "WN", "B6", "AA", "EV", …
## $ tailnum <chr> "N626VA", "N3760C", "N712TW", "N914DL", "N823AY", "N3AXAA", …
## $ flight <int> 407, 329, 422, 2391, 3652, 353, 1428, 1407, 2279, 4162, 20, …
## $ origin <chr> "JFK", "JFK", "JFK", "JFK", "LGA", "LGA", "EWR", "JFK", "LGA…
## $ dest <chr> "LAX", "SJU", "LAX", "TPA", "ORF", "ORD", "HOU", "IAD", "MIA…
## $ air_time <dbl> 313, 216, 376, 135, 50, 138, 240, 48, 148, 110, 50, 161, 87,…
## $ distance <dbl> 2475, 1598, 2475, 1005, 296, 733, 1411, 228, 1096, 820, 264,…
## $ hour <dbl> 9, 16, 8, 18, 11, 18, 12, 19, 7, 13, 9, 13, 8, 20, 12, 20, 6…
## $ minute <dbl> 40, 57, 59, 41, 2, 17, 59, 20, 25, 23, 40, 20, 9, 54, 17, 24…
# Histogram of departure delays for all flights, using the default binning.
# The bar outline width is set with `linewidth`: ggplot2 3.4.0 deprecated
# `size` for line widths, and using it here raised a lifecycle warning.
ggplot(data = nycflights, aes(x = dep_delay)) +
  geom_histogram(colour = "palevioletred1", linewidth = 4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Departure-delay histogram again, this time with an explicit bin width of 15
# and the light theme.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 15, colour = "palegreen") +
  theme_light()
# Departure-delay histogram with a much coarser bin width (150), for
# comparison with the finer binning used on the other histograms.
# `linewidth` (not the deprecated `size`) controls the bar outline width.
ggplot(data = nycflights, aes(x = dep_delay)) +
  geom_histogram(colour = "palevioletred1", linewidth = 3, binwidth = 150) +
  theme_classic()
# Keep only the flights whose destination is LAX.
lax_flights <- dplyr::filter(nycflights, dest == "LAX")
# Histogram of departure delays for the LAX-bound flights (default binning).
# `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
ggplot(data = lax_flights, aes(x = dep_delay)) +
  geom_histogram(colour = "palegreen1", linewidth = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Mean, median and number of observations of departure delay for the
# LAX-bound flights.
lax_flights %>%
  summarise(
    mean_dd = mean(dep_delay),
    median_dd = median(dep_delay),
    n = n()
  )
## # A tibble: 1 × 3
## mean_dd median_dd n
## <dbl> <dbl> <int>
## 1 9.78 -1 1583
# Flights to SFO that departed in month 2 (February).
sfo_feb_flights <- nycflights %>%
  dplyr::filter(dest == "SFO" & month == 2)

# Per origin airport: median and IQR of departure delay, plus flight count.
sfo_feb_flights %>%
  group_by(origin) %>%
  summarise(
    median_dd = median(dep_delay),
    iqr_dd = IQR(dep_delay),
    n_flights = n()
  )
## # A tibble: 2 × 4
## origin median_dd iqr_dd n_flights
## <chr> <dbl> <dbl> <int>
## 1 EWR 0.5 5.75 8
## 2 JFK -2.5 15.2 60
# Histogram of arrival delays for the February SFO flights, bin width 15.
# `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
ggplot(sfo_feb_flights, aes(x = arr_delay)) +
  geom_histogram(binwidth = 15, colour = "palevioletred3", linewidth = 3)
# Overall mean, median and IQR of arrival delay for the February SFO flights,
# together with the number of flights.
sfo_feb_flights %>%
  summarise(
    mean_ad = mean(arr_delay),
    median_ad = median(arr_delay),
    iqr_ad = IQR(arr_delay),
    n_flights = n()
  )
## # A tibble: 1 × 4
## mean_ad median_ad iqr_ad n_flights
## <dbl> <dbl> <dbl> <int>
## 1 -4.5 -11 23.2 68
# Variance of arrival delay per carrier for the February SFO flights, sorted
# from most to least variable.
# `var()` already returns one number per group, so no outer `mean()` is needed.
sfo_feb_flights %>%
  group_by(carrier) %>%
  summarise(var_arr_delay = var(arr_delay)) %>%
  arrange(desc(var_arr_delay))
## # A tibble: 5 × 2
## carrier var_arr_delay
## <chr> <dbl>
## 1 UA 2335.
## 2 VX 1669.
## 3 AA 868.
## 4 DL 485.
## 5 B6 121.
# Mean departure delay for each month, listed from the highest mean to the
# lowest.
nycflights %>%
  group_by(month) %>%
  summarise(
    mean_dd = mean(dep_delay)
  ) %>%
  arrange(desc(mean_dd))
## # A tibble: 12 × 2
## month mean_dd
## <int> <dbl>
## 1 7 20.8
## 2 6 20.4
## 3 12 17.4
## 4 4 14.6
## 5 3 13.5
## 6 5 13.3
## 7 8 12.6
## 8 2 10.7
## 9 1 10.2
## 10 9 6.87
## 11 11 6.10
## 12 10 5.88
# Label each flight by departure punctuality: a departure delay of 5 or more
# counts as "delayed", anything below 5 as "on time".
nycflights <- nycflights %>%
  mutate(
    dep_type = ifelse(dep_delay >= 5, "delayed", "on time")
  )
# Share of "on time" departures for each origin airport, best rate first.
nycflights %>%
  group_by(origin) %>%
  summarise(
    ot_dep_rate = sum(dep_type == "on time") / n()
  ) %>%
  arrange(desc(ot_dep_rate))
## # A tibble: 3 × 2
## origin ot_dep_rate
## <chr> <dbl>
## 1 LGA 0.728
## 2 JFK 0.694
## 3 EWR 0.637
# Stacked bar chart: number of flights per origin airport, with each bar
# split by departure type.
nycflights %>%
  ggplot(aes(x = origin, fill = dep_type)) +
  geom_bar()
# Add each flight's average speed, computed as distance / air_time (in the
# units of those two columns).
# The pipe already supplies the data frame as `.data`; passing `nycflights`
# again made mutate() unpack it and overwrite every column with itself.
nycflights <- nycflights %>%
  mutate(avg_speed = distance / air_time)
# Mean of avg_speed for each tail number, fastest first.
nycflights %>%
  group_by(tailnum) %>%
  summarise(
    avg_speed = mean(avg_speed)
  ) %>%
  arrange(desc(avg_speed))
## # A tibble: 3,490 × 2
## tailnum avg_speed
## <chr> <dbl>
## 1 N526AS 8.49
## 2 N637DL 8.43
## 3 N66051 8.41
## 4 N907JB 8.41
## 5 N522VA 8.38
## 6 N5BTAA 8.32
## 7 N654UA 8.31
## 8 N382HA 8.25
## 9 N75861 8.25
## 10 N5DRAA 8.22
## # ℹ 3,480 more rows
# Scatter plot of distance against average speed, with points coloured by
# carrier.
ggplot(data = nycflights) +
  geom_point(
    mapping = aes(x = avg_speed, y = distance, color = carrier)
  )