1. Cari Datasets selain iris dan Teams.
  2. Dataset nycflights13
# Mencari dataset
# Dataset nycflights13
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.0.5
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Preview the flights table straight from the package namespace
print(nycflights13::flights)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep a working copy of the dataset under the name `flights`.
# The package object already prints as a tibble, so it is assigned directly.
flights <- nycflights13::flights
flights
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
  1. Penggunaan fungsi summarise(), arrange(), filter(), mutate(), select() minimal 1 kali secara terpisah.
# summarise(): per-carrier average of the dep_time column, ignoring NAs.
# NOTE(review): the printed dep_time values look HHMM-encoded (e.g. 554 vs.
# sched 600 with dep_delay -6), so this averages the encoded integers rather
# than clock times -- confirm that is intended.
flights %>%
  group_by(carrier) %>%
  summarise(
    mean = mean(dep_time, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 16 x 2
##    carrier  mean
##    <chr>   <dbl>
##  1 9E      1487.
##  2 AA      1297.
##  3 AS      1295.
##  4 B6      1381.
##  5 DL      1351.
##  6 EV      1369.
##  7 F9      1438.
##  8 FL      1387.
##  9 HA       949.
## 10 MQ      1393.
## 11 OO      1725.
## 12 UA      1327.
## 13 US      1231.
## 14 VX      1280.
## 15 WN      1281.
## 16 YV      1601.
# arrange(): order rows by ascending dep_time
arrange(flights, dep_time)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    13        1           2249        72      108           2357
##  2  2013     1    31        1           2100       181      124           2225
##  3  2013    11    13        1           2359         2      442            440
##  4  2013    12    16        1           2359         2      447            437
##  5  2013    12    20        1           2359         2      430            440
##  6  2013    12    26        1           2359         2      437            440
##  7  2013    12    30        1           2359         2      441            437
##  8  2013     2    11        1           2100       181      111           2225
##  9  2013     2    24        1           2245        76      121           2354
## 10  2013     3     8        1           2355         6      431            440
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Same ordering reversed: largest dep_time first
arrange(flights, desc(dep_time))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    10    30     2400           2359         1      327            337
##  2  2013    11    27     2400           2359         1      515            445
##  3  2013    12     5     2400           2359         1      427            440
##  4  2013    12     9     2400           2359         1      432            440
##  5  2013    12     9     2400           2250        70       59           2356
##  6  2013    12    13     2400           2359         1      432            440
##  7  2013    12    19     2400           2359         1      434            440
##  8  2013    12    29     2400           1700       420      302           2025
##  9  2013     2     7     2400           2359         1      432            436
## 10  2013     2     7     2400           2359         1      443            444
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# filter(): keep only rows whose carrier code is "AA"
filter(flights, carrier == "AA")
## # A tibble: 32,729 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      542            540         2      923            850
##  2  2013     1     1      558            600        -2      753            745
##  3  2013     1     1      559            600        -1      941            910
##  4  2013     1     1      606            610        -4      858            910
##  5  2013     1     1      623            610        13      920            915
##  6  2013     1     1      628            630        -2     1137           1140
##  7  2013     1     1      629            630        -1      824            810
##  8  2013     1     1      635            635         0     1028            940
##  9  2013     1     1      656            700        -4      854            850
## 10  2013     1     1      656            659        -3      949            959
## # ... with 32,719 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# mutate(): add the time between departure and arrival, in minutes.
# dep_time and arr_time are clock readings written as HHMM integers
# (dep 554 against sched 600 is a -6 minute delay), so subtracting them
# directly mixes hours and minutes: 830 - 517 gives 313, but 5:17 -> 8:30 is
# 193 minutes. Convert each reading to minutes since midnight first.
hhmm_to_minutes <- function(hhmm) {
  (hhmm %/% 100L) * 60L + hhmm %% 100L
}
# `%% 1440L` wraps arrivals that fall after midnight (arr_time < dep_time)
# back into the 0-1439 range instead of yielding a negative value.
# NOTE(review): per the nycflights13 docs both columns are local times, so
# routes crossing time zones still differ from the true duration; the
# air_time column is the one to use for time actually spent in the air.
# The column keeps its original (misspelled) name so any later code that reads
# flights2$differance is unaffected.
flights2 <- flights %>%
  mutate(
    differance =
      (hhmm_to_minutes(arr_time) - hhmm_to_minutes(dep_time)) %% 1440L
  )
# select(): show the timing columns next to the new one
flights2 %>%
  select(dep_time, dep_delay, arr_time, arr_delay, carrier, differance)
## # A tibble: 336,776 x 6
##    dep_time dep_delay arr_time arr_delay carrier differance
##       <int>     <dbl>    <int>     <dbl> <chr>        <int>
##  1      517         2      830        11 UA             193
##  2      533         4      850        20 UA             197
##  3      542         2      923        33 AA             221
##  4      544        -1     1004       -18 B6             260
##  5      554        -6      812       -25 DL             138
##  6      554        -4      740        12 UA             106
##  7      555        -5      913        19 B6             198
##  8      557        -3      709       -14 EV              72
##  9      557        -3      838        -8 B6             161
## 10      558        -2      753         8 AA             115
## # ... with 336,766 more rows
  1. Penggunaan fungsi tersebut secara bersama-sama (jumlahnya bebas).
# select() + filter(): timing columns for AA rows with dep_time above 2000
flights %>%
  select(dep_time, dep_delay, arr_time, arr_delay, carrier) %>%
  filter(carrier == "AA", dep_time > 2000)
## # A tibble: 1,506 x 5
##    dep_time dep_delay arr_time arr_delay carrier
##       <int>     <dbl>    <int>     <dbl> <chr>  
##  1     2002         7     2306        -4 AA     
##  2     2013        -2     2120       -10 AA     
##  3     2030       -15     2150       -35 AA     
##  4     2128        -7       26       -24 AA     
##  5     2205       285       46       246 AA     
##  6     2003         8     2309        -1 AA     
##  7     2023         8     2132         2 AA     
##  8     2041        -4     2210       -15 AA     
##  9     2134        -1       36       -14 AA     
## 10     2015         0     2116       -14 AA     
## # ... with 1,496 more rows
# filter() + select() + arrange(): UA rows with dep_time below 2000,
# reduced to the timing columns and sorted from largest dep_time down
flights %>%
  filter(carrier == "UA", dep_time < 2000) %>%
  select(dep_time, dep_delay, arr_time, arr_delay, carrier) %>%
  arrange(desc(dep_time))
## # A tibble: 52,138 x 5
##    dep_time dep_delay arr_time arr_delay carrier
##       <int>     <dbl>    <int>     <dbl> <chr>  
##  1     1959        -1     2310         3 UA     
##  2     1959        -2     2133        -6 UA     
##  3     1959        -6     2319         8 UA     
##  4     1959        -6     2257       -18 UA     
##  5     1959        -6     2233       -25 UA     
##  6     1959        -6     2233       -26 UA     
##  7     1959        -1     2312       -16 UA     
##  8     1959        24     2233        -9 UA     
##  9     1959        58     2100        20 UA     
## 10     1959        -1     2056       -24 UA     
## # ... with 52,128 more rows