# Setup ----
# Install pacman only when it is missing, so re-running the script does not
# reinstall the package (and require network access) every time.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)
# p_load() attaches the listed packages, installing any that are not yet
# available.
p_load(nycflights13, tidyverse)
# Q1
# Keep the flights whose `year` is greater than 1.
flights %>%
  filter(year > 1)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Q2
# Keep the flights whose `arr_delay` is 120 or more.
flights %>%
  filter(arr_delay >= 120)
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 811 630 101 1047 830
## 2 2013 1 1 848 1835 853 1001 1950
## 3 2013 1 1 957 733 144 1056 853
## 4 2013 1 1 1114 900 134 1447 1222
## 5 2013 1 1 1505 1310 115 1638 1431
## 6 2013 1 1 1525 1340 105 1831 1626
## 7 2013 1 1 1549 1445 64 1912 1656
## 8 2013 1 1 1558 1359 119 1718 1515
## 9 2013 1 1 1732 1630 62 2028 1825
## 10 2013 1 1 1803 1620 103 2008 1750
## # … with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Show only the date columns plus the delay and departure/arrival time columns.
flights %>%
  select(
    year:day,
    arr_delay,
    dep_time,
    sched_dep_time,
    dep_delay,
    arr_time
  )
## # A tibble: 336,776 x 8
## year month day arr_delay dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <dbl> <int> <int> <dbl> <int>
## 1 2013 1 1 11 517 515 2 830
## 2 2013 1 1 20 533 529 4 850
## 3 2013 1 1 33 542 540 2 923
## 4 2013 1 1 -18 544 545 -1 1004
## 5 2013 1 1 -25 554 600 -6 812
## 6 2013 1 1 12 554 558 -4 740
## 7 2013 1 1 19 555 600 -5 913
## 8 2013 1 1 -14 557 600 -3 709
## 9 2013 1 1 -8 557 600 -3 838
## 10 2013 1 1 8 558 600 -2 753
## # … with 336,766 more rows
# Q3
# The single flight with the largest `arr_delay`: trim to the columns of
# interest, sort from the largest delay downwards, and keep the top row.
flights %>%
  select(
    year:day, carrier, flight,
    arr_delay, dep_time, sched_dep_time, dep_delay
  ) %>%
  arrange(desc(arr_delay)) %>%
  slice(1)
## # A tibble: 1 x 9
## year month day carrier flight arr_delay dep_time sched_dep_time dep_delay
## <int> <int> <int> <chr> <int> <dbl> <int> <int> <dbl>
## 1 2013 1 9 HA 51 1272 641 900 1301
# Q4
# Mean arrival delay per carrier (missing delays ignored), listed from the
# highest mean to the lowest.
flights %>%
  group_by(carrier) %>%
  summarise(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  arrange(desc(arr_delay))
## # A tibble: 16 x 2
## carrier arr_delay
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
# Q5
# Mean arrival delay per `hour` (missing delays ignored), listed from the
# highest mean to the lowest. An hour with no non-missing `arr_delay` values
# comes out as NaN and sorts last.
flights %>%
  group_by(hour) %>%
  summarise(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  arrange(desc(arr_delay))
## # A tibble: 20 x 2
## hour arr_delay
## <dbl> <dbl>
## 1 21 18.4
## 2 20 16.7
## 3 19 16.7
## 4 17 16.0
## 5 22 16.0
## 6 18 14.8
## 7 16 12.6
## 8 15 12.3
## 9 23 11.8
## 10 14 9.20
## 11 13 6.54
## 12 12 3.49
## 13 11 1.48
## 14 10 0.954
## 15 8 -1.11
## 16 9 -1.45
## 17 6 -3.38
## 18 5 -4.80
## 19 7 -5.30
## 20 1 NaN
# Q6
# Reduce flights to the date, hour, route, tail number, and carrier columns.
flights %>%
  select(
    year:day,
    hour,
    origin,
    dest,
    tailnum,
    carrier
  )
## # A tibble: 336,776 x 8
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # … with 336,766 more rows
# Join each airline record to its flights. The key is spelled out so the
# result does not depend on dplyr inferring the shared columns, and the
# "Joining, by = ..." message is no longer emitted.
left_join(airlines, flights, by = "carrier")
## # A tibble: 336,776 x 20
## carrier name year month day dep_time sched_dep_time dep_delay arr_time
## <chr> <chr> <int> <int> <int> <int> <int> <dbl> <int>
## 1 9E Ende… 2013 1 1 810 810 0 1048
## 2 9E Ende… 2013 1 1 1451 1500 -9 1634
## 3 9E Ende… 2013 1 1 1452 1455 -3 1637
## 4 9E Ende… 2013 1 1 1454 1500 -6 1635
## 5 9E Ende… 2013 1 1 1507 1515 -8 1651
## 6 9E Ende… 2013 1 1 1530 1530 0 1650
## 7 9E Ende… 2013 1 1 1546 1540 6 1753
## 8 9E Ende… 2013 1 1 1550 1550 0 1844
## 9 9E Ende… 2013 1 1 1552 1600 -8 1749
## 10 9E Ende… 2013 1 1 1554 1600 -6 1701
## # … with 336,766 more rows, and 11 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Take the first 100 flights, keep a few identifying columns, look up the
# airline record by carrier code, and show the first six rows of the result.
flights %>%
  slice(1:100) %>%
  select(year:day, hour, origin, dest, tailnum, carrier) %>%
  left_join(y = airlines, by = "carrier") %>%
  slice(1:6)
## # A tibble: 6 x 9
## year month day hour origin dest tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 LGA IAH N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 JFK MIA N619AA AA American Airlines Inc.
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBlue Airways
## 5 2013 1 1 6 LGA ATL N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 EWR ORD N39463 UA United Air Lines Inc.
#Q7
#Q8
# Q9
# Mean departure delay (missing delays ignored) for every
# carrier / month / origin combination, largest mean first.
# summarise() removes only the last grouping level, so without ungroup() the
# result would stay grouped by carrier and month and any later verb applied
# to it would silently operate per group.
flights %>%
  group_by(carrier, month, origin) %>%
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(dep_delay))
## # A tibble: 399 x 4
## carrier month origin dep_delay
## <chr> <int> <chr> <dbl>
## 1 OO 1 LGA 67
## 2 OO 8 LGA 64
## 3 OO 6 EWR 61
## 4 HA 1 JFK 54.4
## 5 YV 6 LGA 42.8
## 6 FL 7 LGA 41.2
## 7 VX 7 JFK 39.8
## 8 FL 6 LGA 38.8
## 9 WN 6 EWR 36.2
## 10 F9 5 LGA 35.9
## # … with 389 more rows
# Q10
# Histogram of the `temp` column of the weather table, with an explicit
# title and axis labels.
hist(
  weather$temp,
  main = "2013 New York City Airport Temperature",
  xlab = "Temperature",
  ylab = "Frequency"
)
