# Load the libraries and the data
library(nycflights13)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Keep a short alias of the flights tibble for later ad-hoc inspection.
fl <- flights
# Inspect the data interactively and look up its help page.
view(flights)
help("flights")
## starting httpd help server ...
## done
# Rows with month 1 and day 1 (printed, not stored).
flights %>%
  filter(month == 1, day == 1)
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 832 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Same month-1 / day-1 subset as above, stored this time.
jan1 <- flights %>%
  filter(month == 1, day == 1)

# Rows whose month is 11 or 12; stored, then printed.
nov_dec <- flights %>%
  filter(month %in% c(11, 12))
nov_dec
## # A tibble: 55,403 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 11 1 5 2359 6 352 345
## 2 2013 11 1 35 2250 105 123 2356
## 3 2013 11 1 455 500 -5 641 651
## 4 2013 11 1 539 545 -6 856 827
## 5 2013 11 1 542 545 -3 831 855
## 6 2013 11 1 549 600 -11 912 923
## 7 2013 11 1 550 600 -10 705 659
## 8 2013 11 1 554 600 -6 659 701
## 9 2013 11 1 554 600 -6 826 827
## 10 2013 11 1 554 600 -6 749 751
## # ... with 55,393 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Rows whose departure and arrival delays are both exactly zero; stored, then
# printed. Comma-separated conditions in filter() are combined with AND.
no_delay <- flights %>%
  filter(dep_delay == 0, arr_delay == 0)
no_delay
## # A tibble: 347 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 2 600 600 0 846 846
## 2 2013 1 2 1437 1437 0 1742 1742
## 3 2013 1 3 835 835 0 1102 1102
## 4 2013 1 3 1245 1245 0 1600 1600
## 5 2013 1 4 2005 2005 0 2311 2311
## 6 2013 1 6 937 937 0 1102 1102
## 7 2013 1 6 1515 1515 0 1700 1700
## 8 2013 1 6 1932 1932 0 2243 2243
## 9 2013 1 6 2030 2030 0 2258 2258
## 10 2013 1 8 1030 1030 0 1252 1252
## # ... with 337 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Rows whose arrival delay is exactly zero; stored, then printed.
# NOTE(review): the name suggests "on time or early" (arr_delay <= 0), but the
# condition only keeps arr_delay == 0 — confirm the intent.
not_late <- flights %>%
  filter(arr_delay == 0)
not_late
## # A tibble: 5,409 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 627 630 -3 1018 1018
## 2 2013 1 1 807 810 -3 1043 1043
## 3 2013 1 1 956 1000 -4 1241 1241
## 4 2013 1 1 1124 1125 -1 1445 1445
## 5 2013 1 1 1219 1220 -1 1415 1415
## 6 2013 1 1 1240 1235 5 1415 1415
## 7 2013 1 1 1248 1250 -2 1607 1607
## 8 2013 1 1 1333 1335 -2 1608 1608
## 9 2013 1 1 1459 1501 -2 1651 1651
## 10 2013 1 1 1510 1517 -7 1811 1811
## # ... with 5,399 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Sort rows by year, then month, then day (ascending); result is printed.
flights %>%
  arrange(year, month, day)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Sort rows by departure delay, largest first; result is printed.
flights %>%
  arrange(desc(dep_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep only the dep_delay column; result is printed.
flights %>%
  select(dep_delay)
## # A tibble: 336,776 x 1
## dep_delay
## <dbl>
## 1 2
## 2 4
## 3 2
## 4 -1
## 5 -6
## 6 -4
## 7 -5
## 8 -3
## 9 -3
## 10 -2
## # ... with 336,766 more rows
# Move dep_delay to the front and keep every other column after it.
flights %>%
  select(dep_delay, everything())
## # A tibble: 336,776 x 19
## dep_delay year month day dep_time sched_dep_time arr_time sched_arr_time
## <dbl> <int> <int> <int> <int> <int> <int> <int>
## 1 2 2013 1 1 517 515 830 819
## 2 4 2013 1 1 533 529 850 830
## 3 2 2013 1 1 542 540 923 850
## 4 -1 2013 1 1 544 545 1004 1022
## 5 -6 2013 1 1 554 600 812 837
## 6 -4 2013 1 1 554 558 740 728
## 7 -5 2013 1 1 555 600 913 854
## 8 -3 2013 1 1 557 600 709 723
## 9 -3 2013 1 1 557 600 838 846
## 10 -2 2013 1 1 558 600 753 745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights whose destination code starts with "M"; stored, then printed.
# Rows are chosen with a logical test on the dest column itself rather than
# with starts_with(), which is a column-selection helper meant for matching
# column names inside select()-style calls.
# NOTE(review): startsWith() is case-sensitive, whereas starts_with() ignores
# case by default — assumes dest codes are upper-case; confirm.
(M_flights <- filter(flights, startsWith(dest, "M")))
## # A tibble: 49,382 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 542 540 2 923 850
## 2 2013 1 1 557 600 -3 838 846
## 3 2013 1 1 602 610 -8 812 820
## 4 2013 1 1 606 610 -4 858 910
## 5 2013 1 1 607 607 0 858 915
## 6 2013 1 1 623 610 13 920 915
## 7 2013 1 1 624 630 -6 909 840
## 8 2013 1 1 624 630 -6 840 830
## 9 2013 1 1 637 645 -8 930 935
## 10 2013 1 1 652 655 -3 932 921
## # ... with 49,372 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Narrow working table: date parts, the columns ending in "delay",
# distance and air time.
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)

# Append derived columns; later expressions reuse columns created earlier in
# the same mutate() call (gain and hours feed gain_per_hours).
flights_sml_add <- flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    hours = air_time / 60,
    gain_per_hours = gain / hours
  )

# transmute() returns only the newly computed columns; result is printed.
flights %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hours = gain / hours
  )
## # A tibble: 336,776 x 3
## gain hours gain_per_hours
## <dbl> <dbl> <dbl>
## 1 -9 3.78 -2.38
## 2 -16 3.78 -4.23
## 3 -31 2.67 -11.6
## 4 17 3.05 5.57
## 5 19 1.93 9.83
## 6 -16 2.5 -6.4
## 7 -24 2.63 -9.11
## 8 11 0.883 12.5
## 9 5 2.33 2.14
## 10 -10 2.3 -4.35
## # ... with 336,766 more rows
# The same three derived columns, computed from the full table...
transmute_test <- flights %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hours = gain / hours
  )

# ...and from the narrowed table.
transmute_test2 <- flights_sml %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hours = gain / hours
  )

# Overall mean departure delay, ignoring missing values; result is printed.
flights %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 x 1
## delay
## <dbl>
## 1 12.6
# Group the flights by destination; stored, then printed.
by_dest <- flights %>%
  group_by(dest)
by_dest
## # A tibble: 336,776 x 19
## # Groups: dest [105]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Per-destination row count, mean distance and mean arrival delay
# (missing values are ignored in both means).
delay <- by_dest %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
# Keep destinations with more than 20 flights and drop "HNL".
delay <- delay %>%
  filter(count > 20, dest != "HNL")

# Mean arrival delay against mean distance; point size encodes the flight
# count, with a smoother drawn without its confidence band.
ggplot(delay, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Single-pipeline version of the per-destination summary: group, summarise,
# then keep destinations with more than 20 flights other than "HNL".
delay_s <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20) %>%
  filter(dest != "HNL")
## `summarise()` ungrouping output (override with `.groups` argument)
# Summary statistics of departure delay; the output reports 8255 NA's.
fl %>%
  pull(dep_delay) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -43.00 -5.00 -2.00 12.64 11.00 1301.00 8255
# Drop rows where either the departure delay or the arrival delay is missing.
not_cancelled <- flights %>%
  filter(!(is.na(dep_delay) | is.na(arr_delay)))

# Mean departure delay per calendar day; result is printed.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    mean = mean(dep_delay)
  )
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.4
## 2 2013 1 2 13.7
## 3 2013 1 3 10.9
## 4 2013 1 4 8.97
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.56
## 9 2013 1 9 2.30
## 10 2013 1 10 2.84
## # ... with 355 more rows
# Mean arrival delay and number of flights for each tail number.
delays <- summarise(
  group_by(not_cancelled, tailnum),
  delay = mean(arr_delay, na.rm = TRUE),
  n = n()
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Mean arrival delay against number of flights, one point per tail number.
ggplot(delays, aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)

# Same plot, restricted to tail numbers with more than 25 flights.
ggplot(filter(delays, n > 25), aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)