02-17-23
library(tidyverse)
library(openintro)
As the binwidth gets larger, each bar covers a wider range of values, so the highest count in the histogram is shown in a wider, coarser view.
# Keep only the flights whose destination is LAX.
lax_flights <- filter(nycflights, dest == "LAX")

# Histogram of departure delay for the LAX flights, using the default binning.
lax_flights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# One-row summary of the LAX flights: mean and median departure delay and the
# number of flights.
summarise(
  lax_flights,
  mean_dd = mean(dep_delay),
  median_dd = median(dep_delay),
  n = n()
)
## # A tibble: 1 × 3
## mean_dd median_dd n
## <dbl> <dbl> <int>
## 1 9.78 -1 1583
There are 68 flights headed to SFO in February.
# Flights headed to SFO in February (month == 2).
# This data frame must keep both conditions: re-assigning it with only
# `dest == "SFO"` would silently turn every "February" result below into a
# result for all SFO flights.
sfo_feb_flights <- nycflights %>%
  filter(dest == "SFO", month == 2)

# Histogram of arrival delay for the February SFO flights (default binning).
ggplot(data = sfo_feb_flights, aes(x = arr_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Median and interquartile range of arrival delay, and the number of flights,
# for each carrier.
sfo_feb_flights %>%
  group_by(carrier) %>%
  summarise(
    median_dd = median(arr_delay),
    iqr_dd = IQR(arr_delay),
    n_flights = n()
  )
# NOTE(review): the table below was printed when `sfo_feb_flights` held every
# SFO flight (its n_flights column sums to 1345, not the 68 February flights).
# Re-run this chunk to refresh the numbers for February only.
## # A tibble: 5 × 4
## carrier median_dd iqr_dd n_flights
## <chr> <dbl> <dbl> <int>
## 1 AA -0.5 43.8 136
## 2 B6 -9 35.2 96
## 3 DL -12 27 205
## 4 UA -4 34 685
## 5 VX -12 33 223
Pros: Travelling in the month with the lowest mean departure delay saves us waiting time.
Cons: In the month with the highest median departure delay, we will arrive at our destination late and our travel schedule will be disrupted.
# Mean departure delay per month, sorted from the largest mean to the smallest.
group_by(nycflights, month) %>%
  summarise(mean_dd = mean(dep_delay)) %>%
  arrange(desc(mean_dd))
## # A tibble: 12 × 2
## month mean_dd
## <int> <dbl>
## 1 7 20.8
## 2 6 20.4
## 3 12 17.4
## 4 4 14.6
## 5 3 13.5
## 6 5 13.3
## 7 8 12.6
## 8 2 10.7
## 9 1 10.2
## 10 9 6.87
## 11 11 6.10
## 12 10 5.88
# Mean and median departure delay for each month.
# mean() and median() already return a single value per month, so wrapping them
# in min()/max() changes nothing; the columns are named after the statistic
# they actually hold.
nycflights %>%
  group_by(month) %>%
  summarise(
    mean_dd = mean(dep_delay),
    median_dd = median(dep_delay)
  )
## # A tibble: 12 × 3
## month mean_dd median_dd
## <int> <dbl> <dbl>
## 1 1 10.2 -2
## 2 2 10.7 -2
## 3 3 13.5 -1
## 4 4 14.6 -2
## 5 5 13.3 -1
## 6 6 20.4 0
## 7 7 20.8 0
## 8 8 12.6 -1
## 9 9 6.87 -3
## 10 10 5.88 -3
## 11 11 6.10 -2
## 12 12 17.4 1
LGA, the airport with the highest on-time departure rate, should be chosen; EWR has the lowest on-time departure rate.
# Label every flight by departure punctuality: a departure delay below 5 is
# "on time", anything else is "delayed".
nycflights <- mutate(
  nycflights,
  dep_type = ifelse(dep_delay < 5, "on time", "delayed")
)

# Share of on-time departures for each origin airport, highest rate first.
group_by(nycflights, origin) %>%
  summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>%
  arrange(desc(ot_dep_rate))
## # A tibble: 3 × 2
## origin ot_dep_rate
## <chr> <dbl>
## 1 LGA 0.728
## 2 JFK 0.694
## 3 EWR 0.637
# Bar chart of the number of flights per origin airport, with each bar split
# by departure type (on time / delayed).
ggplot(data = nycflights, aes(x = origin, fill = dep_type)) +
  geom_bar()

# Mean per-flight speed for each origin airport, fastest first.
# distance * 60 / air_time is the speed of a single flight; the `_mph` name
# presumes distance is in miles and air_time in minutes (confirm against the
# dataset documentation).
nycflights %>%
  group_by(origin) %>%
  summarise(avg_speed_mph = mean(distance * 60 / air_time)) %>%
  arrange(desc(avg_speed_mph))
## # A tibble: 3 × 2
## origin avg_speed_mph
## <chr> <dbl>
## 1 JFK 398.
## 2 EWR 396.
## 3 LGA 387.
# Per-flight average speed: distance divided by air time, with the factor 60
# converting air_time to hours (presumes air_time is recorded in minutes).
# The value is deliberately not divided by n(): in an ungrouped mutate() n() is
# the number of rows in the whole table, which would shrink every speed by the
# size of the data set.
nycflights <- nycflights %>%
  mutate(avg_speed = distance * 60 / air_time)

# Scatter plot of average speed against flight distance.
ggplot(data = nycflights, aes(x = distance, y = avg_speed)) +
  geom_point()

# The cutoff point for departure delays at which we can still expect to arrive
# on time is 5.
# List the columns of the flights table, including the derived `dep_type` and
# `avg_speed` columns.
colnames(nycflights)
## [1] "year" "month" "day" "dep_time" "dep_delay" "arr_time"
## [7] "arr_delay" "carrier" "tailnum" "flight" "origin" "dest"
## [13] "air_time" "distance" "hour" "minute" "dep_type" "avg_speed"
# Arrival delay against departure delay, one point per flight, coloured by
# carrier.
ggplot(data = nycflights) +
  geom_point(mapping = aes(x = dep_delay, y = arr_delay, color = carrier))

# The same scatter plot, coloured by departure type (on time / delayed).
ggplot(data = nycflights) +
  geom_point(mapping = aes(x = dep_delay, y = arr_delay, color = dep_type))