install.packages(“pacman”) library(pacman) p_load(nycflights13, tidyverse)
library(pacman)
p_load(nycflights13, tidyverse)
data(flights)
data(airlines)
data(weather)
data(airports)
head(flights)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(airlines)
## # A tibble: 6 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
head(weather)
## # A tibble: 6 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_gust
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
## # … with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>
Q1.找出1月1號所有航班資料
head(flights)[flights$month==1&flights$day==1,]
## Warning: Length of logical index must be 1 or 6, not 336776
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 NA NA NA NA NA NA NA NA
## 8 NA NA NA NA NA NA NA NA
## 9 NA NA NA NA NA NA NA NA
## 10 NA NA NA NA NA NA NA NA
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Q2. 找出抵達誤點超過二個小時以上所有航班資料,並將arr_delay顯示在第四欄
flights %>% filter(arr_delay >= 120) %>%
select(year:day, arr_delay, dep_time, sched_dep_time, dep_delay, arr_time)
## # A tibble: 10,200 x 8
## year month day arr_delay dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <dbl> <int> <int> <dbl> <int>
## 1 2013 1 1 137 811 630 101 1047
## 2 2013 1 1 851 848 1835 853 1001
## 3 2013 1 1 123 957 733 144 1056
## 4 2013 1 1 145 1114 900 134 1447
## 5 2013 1 1 127 1505 1310 115 1638
## 6 2013 1 1 125 1525 1340 105 1831
## 7 2013 1 1 136 1549 1445 64 1912
## 8 2013 1 1 123 1558 1359 119 1718
## 9 2013 1 1 123 1732 1630 62 2028
## 10 2013 1 1 138 1803 1620 103 2008
## # … with 10,190 more rows
Q3. 將抵達誤點時間, 以遞減的順序進行排列,並找出誤點最久的班機資訊
flights %>%
arrange(desc(arr_delay)) %>%
select(year:day, arr_delay, everything()) %>%
slice(1)
## # A tibble: 1 x 19
## year month day arr_delay dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <dbl> <int> <int> <dbl> <int>
## 1 2013 1 9 1272 641 900 1301 1242
## # … with 11 more variables: sched_arr_time <int>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Q4. 那家航空公司誤點(arr_delay)最嚴重?平均時間為多少?
flights %>%
group_by(carrier) %>%
summarize(avg_arr_delay = mean(arr_delay,na.rm = T)) %>%
arrange(desc(avg_arr_delay))
## # A tibble: 16 x 2
## carrier avg_arr_delay
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
Q5. 要盡可能避開誤點,應該搭乘一天中那個時間的航班?
flights %>%
group_by(hour) %>%
summarize(arr_delay = mean(arr_delay,na.rm = T)) %>%
arrange(desc(arr_delay))
## # A tibble: 20 x 2
## hour arr_delay
## <dbl> <dbl>
## 1 21 18.4
## 2 20 16.7
## 3 19 16.7
## 4 17 16.0
## 5 22 16.0
## 6 18 14.8
## 7 16 12.6
## 8 15 12.3
## 9 23 11.8
## 10 14 9.20
## 11 13 6.54
## 12 12 3.49
## 13 11 1.48
## 14 10 0.954
## 15 8 -1.11
## 16 9 -1.45
## 17 6 -3.38
## 18 5 -4.80
## 19 7 -5.30
## 20 1 NaN
Q6. flights中「依序」選出year:day, hour, origin, dest, tailnum, carrier等欄位,再抽出前100筆記資料,最後和airlines做left_join
answer06 <- local({
slice(flights, 1:100) %>%
select(year:day, hour, origin, dest, tailnum, carrier) %>%
left_join(y = airlines, by = "carrier")
})
head(answer06)
## # A tibble: 6 x 9
## year month day hour origin dest tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 LGA IAH N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 JFK MIA N619AA AA American Airlines Inc.
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBlue Airways
## 5 2013 1 1 6 LGA ATL N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 EWR ORD N39463 UA United Air Lines Inc.
Q7. 將answer06與weather結合,再與airports結合
left_join(left_join(answer06,weather,by =NULL,copy=FALSE,suffix= c("dest","faa")),airports,by =NULL,copy=FALSE,suffix= c("dest","faa"))
## Joining, by = c("year", "month", "day", "hour", "origin")
## Joining, by = "name"
## # A tibble: 100 x 26
## year month day hour origin dest tailnum carrier name temp dewp humid
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Unit… 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA Unit… 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA Amer… 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 JetB… 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL Delt… 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA Unit… 39.0 28.0 64.4
## 7 2013 1 1 6 EWR FLL N516JB B6 JetB… 37.9 28.0 67.2
## 8 2013 1 1 6 LGA IAD N829AS EV Expr… 39.9 25.0 54.8
## 9 2013 1 1 6 JFK MCO N593JB B6 JetB… 37.9 27.0 64.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA Amer… 39.9 25.0 54.8
## # … with 90 more rows, and 14 more variables: wind_dir <dbl>, wind_speed <dbl>,
## # wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>, faa <chr>, lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>,
## # dst <chr>, tzone <chr>
Q8. 計算每個月抵達ALB, BDL, BTV三個機場的班機總次數
by_dest=group_by(flights,year,month,dest)
summarise(by_dest,count = n())
## # A tibble: 1,113 x 4
## # Groups: year, month [12]
## year month dest count
## <int> <int> <chr> <int>
## 1 2013 1 ALB 64
## 2 2013 1 ATL 1396
## 3 2013 1 AUS 169
## 4 2013 1 AVL 2
## 5 2013 1 BDL 37
## 6 2013 1 BHM 25
## 7 2013 1 BNA 399
## 8 2013 1 BOS 1245
## 9 2013 1 BQN 93
## 10 2013 1 BTV 223
## # … with 1,103 more rows
Q9. 依始發地、航空公司、及月份(origin, carrier, month) 計算平均起飛延誤(dep_delay)的時間
flights %>%
group_by(origin, carrier, month) %>%
summarize(avg_dep_delay = mean(dep_delay,na.rm = T)) %>%
arrange(desc(avg_dep_delay))
## # A tibble: 399 x 4
## # Groups: origin, carrier [35]
## origin carrier month avg_dep_delay
## <chr> <chr> <int> <dbl>
## 1 LGA OO 1 67
## 2 LGA OO 8 64
## 3 EWR OO 6 61
## 4 JFK HA 1 54.4
## 5 LGA YV 6 42.8
## 6 LGA FL 7 41.2
## 7 JFK VX 7 39.8
## 8 LGA FL 6 38.8
## 9 EWR WN 6 36.2
## 10 LGA F9 5 35.9
## # … with 389 more rows
Q10. 以直方圖畫出分配
hist(weather$temp)