# Loading the libraries and data

library(nycflights13)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Short alias for the flights tibble; used again in the missing-value summary.
fl <- flights
# Inspect the data and pull up its help page.
view(flights)
help("flights")
## starting httpd help server ...
##  done

Filtering

# Rows for 1 January (both conditions must hold).
flights %>% filter(month == 1, day == 1)
## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 832 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Same 1 January subset, stored for later use.
jan1 <- flights %>% filter(month == 1, day == 1)

# Flights in November or December; the outer parentheses print the result.
(nov_dec <- flights %>% filter(month %in% c(11, 12)))
## # A tibble: 55,403 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    11     1        5           2359         6      352            345
##  2  2013    11     1       35           2250       105      123           2356
##  3  2013    11     1      455            500        -5      641            651
##  4  2013    11     1      539            545        -6      856            827
##  5  2013    11     1      542            545        -3      831            855
##  6  2013    11     1      549            600       -11      912            923
##  7  2013    11     1      550            600       -10      705            659
##  8  2013    11     1      554            600        -6      659            701
##  9  2013    11     1      554            600        -6      826            827
## 10  2013    11     1      554            600        -6      749            751
## # ... with 55,393 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that both departed and arrived with a delay of exactly zero.
(no_delay <- flights %>% filter(dep_delay == 0, arr_delay == 0))
## # A tibble: 347 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     2      600            600         0      846            846
##  2  2013     1     2     1437           1437         0     1742           1742
##  3  2013     1     3      835            835         0     1102           1102
##  4  2013     1     3     1245           1245         0     1600           1600
##  5  2013     1     4     2005           2005         0     2311           2311
##  6  2013     1     6      937            937         0     1102           1102
##  7  2013     1     6     1515           1515         0     1700           1700
##  8  2013     1     6     1932           1932         0     2243           2243
##  9  2013     1     6     2030           2030         0     2258           2258
## 10  2013     1     8     1030           1030         0     1252           1252
## # ... with 337 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights whose arrival delay is exactly zero.
# NOTE(review): the name suggests "arrived on time or early" (arr_delay <= 0),
# but only arr_delay == 0 is kept here -- confirm which one is intended.
(not_late <- flights %>% filter(arr_delay == 0))
## # A tibble: 5,409 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      627            630        -3     1018           1018
##  2  2013     1     1      807            810        -3     1043           1043
##  3  2013     1     1      956           1000        -4     1241           1241
##  4  2013     1     1     1124           1125        -1     1445           1445
##  5  2013     1     1     1219           1220        -1     1415           1415
##  6  2013     1     1     1240           1235         5     1415           1415
##  7  2013     1     1     1248           1250        -2     1607           1607
##  8  2013     1     1     1333           1335        -2     1608           1608
##  9  2013     1     1     1459           1501        -2     1651           1651
## 10  2013     1     1     1510           1517        -7     1811           1811
## # ... with 5,399 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Arrange Function

# Sort ascending by year, then month, then day.
flights %>% arrange(year, month, day)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Largest departure delays first.
flights %>% arrange(desc(dep_delay))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Select Function

# Keep only the dep_delay column.
flights %>% select(dep_delay)
## # A tibble: 336,776 x 1
##    dep_delay
##        <dbl>
##  1         2
##  2         4
##  3         2
##  4        -1
##  5        -6
##  6        -4
##  7        -5
##  8        -3
##  9        -3
## 10        -2
## # ... with 336,766 more rows
# Move dep_delay to the front and keep every other column after it.
flights %>% select(dep_delay, everything())
## # A tibble: 336,776 x 19
##    dep_delay  year month   day dep_time sched_dep_time arr_time sched_arr_time
##        <dbl> <int> <int> <int>    <int>          <int>    <int>          <int>
##  1         2  2013     1     1      517            515      830            819
##  2         4  2013     1     1      533            529      850            830
##  3         2  2013     1     1      542            540      923            850
##  4        -1  2013     1     1      544            545     1004           1022
##  5        -6  2013     1     1      554            600      812            837
##  6        -4  2013     1     1      554            558      740            728
##  7        -5  2013     1     1      555            600      913            854
##  8        -3  2013     1     1      557            600      709            723
##  9        -3  2013     1     1      557            600      838            846
## 10        -2  2013     1     1      558            600      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep the flights whose destination code begins with "M".
# `starts_with()` is a tidyselect helper for choosing *columns* inside
# select()-style calls; row filtering is expressed as a logical test on `dest`.
# Unlike starts_with(), startsWith() is case-sensitive -- this assumes `dest`
# codes are stored in upper case (TODO confirm), in which case the rows and
# their order are unchanged.
(M_flights <- filter(flights, startsWith(dest, "M")))
## # A tibble: 49,382 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      542            540         2      923            850
##  2  2013     1     1      557            600        -3      838            846
##  3  2013     1     1      602            610        -8      812            820
##  4  2013     1     1      606            610        -4      858            910
##  5  2013     1     1      607            607         0      858            915
##  6  2013     1     1      623            610        13      920            915
##  7  2013     1     1      624            630        -6      909            840
##  8  2013     1     1      624            630        -6      840            830
##  9  2013     1     1      637            645        -8      930            935
## 10  2013     1     1      652            655        -3      932            921
## # ... with 49,372 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Mutate Function

# Narrow table: date parts, both delay columns, distance and air time.
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)

# Append derived columns; later expressions reuse the ones created earlier
# in the same call (gain and hours feed gain_per_hours).
flights_sml_add <- flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    hours = air_time / 60,
    gain_per_hours = gain / hours
  )
# transmute() returns only the newly computed columns.
flights %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hours = gain / hours
  )
## # A tibble: 336,776 x 3
##     gain hours gain_per_hours
##    <dbl> <dbl>          <dbl>
##  1    -9 3.78           -2.38
##  2   -16 3.78           -4.23
##  3   -31 2.67          -11.6 
##  4    17 3.05            5.57
##  5    19 1.93            9.83
##  6   -16 2.5            -6.4 
##  7   -24 2.63           -9.11
##  8    11 0.883          12.5 
##  9     5 2.33            2.14
## 10   -10 2.3            -4.35
## # ... with 336,766 more rows
# Same derived columns computed from the full table ...
transmute_test <- flights %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hours = gain / hours
  )

# ... and from the narrowed table, which carries the same input columns.
transmute_test2 <- flights_sml %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hours = gain / hours
  )

Summarise Function

# Overall mean departure delay, ignoring missing values.
flights %>% summarise(delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 x 1
##   delay
##   <dbl>
## 1  12.6
# Group the flights by destination; the parentheses print the grouped tibble.
(by_dest <- flights %>% group_by(dest))
## # A tibble: 336,776 x 19
## # Groups:   dest [105]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Per destination: number of flights, mean distance and mean arrival delay.
delay <- by_dest %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
# Drop destinations with 20 or fewer flights, and exclude HNL.
delay <- delay %>% filter(count > 20, dest != "HNL")

ggplot

# Mean arrival delay against mean distance per destination; point size shows
# the number of flights, with a smoothed trend line and no confidence band.
ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Using Pipes (the same steps, written as one chain)

# Group, summarise and filter in a single chain, without storing the
# intermediate grouped and summarised tables.
delay_s <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")
## `summarise()` ungrouping output (override with `.groups` argument)

Missing Values

summary(fl[["dep_delay"]]) # 8255 NA's in there
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -43.00   -5.00   -2.00   12.64   11.00 1301.00    8255
# Keep flights where neither the departure nor the arrival delay is missing.
not_cancelled <- flights %>%
  filter(!(is.na(dep_delay) | is.na(arr_delay)))

# Mean departure delay per calendar day; no na.rm needed after the filter.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.4 
##  2  2013     1     2 13.7 
##  3  2013     1     3 10.9 
##  4  2013     1     4  8.97
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.56
##  9  2013     1     9  2.30
## 10  2013     1    10  2.84
## # ... with 355 more rows
# Per tail number: mean arrival delay and the number of flights behind it.
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarise(delay = mean(arr_delay, na.rm = TRUE), n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
# Mean arrival delay per tail number against its number of flights.
ggplot(delays, aes(n, delay)) +
  geom_point(alpha = 1 / 10)

# Same plot as before, restricted to tail numbers with more than 25 flights.
delays %>%
  filter(n > 25) %>%
  ggplot(aes(n, delay)) +
  geom_point(alpha = 1 / 10)