- Cari Datasets selain iris dan Teams.
- Dataset nycflights13
# Mencari dataset
# Dataset nycflights13
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.0.5
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Print the flights table straight from the package namespace to inspect it.
nycflights13::flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Bind the dataset to a short local name and print it.
# nycflights13::flights is available as soon as the package is installed and
# already prints as a tibble, so neither a data() call nor an as_tibble()
# conversion is needed to get the same object.
flights <- nycflights13::flights
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
- Penggunaan fungsi summarise(), arrange(), filter(), mutate(), select() minimal 1 kali secara terpisah.
# summarise(): one row per carrier holding the mean of dep_time (NAs ignored).
# NOTE(review): dep_time looks like an HHMM clock encoding (e.g. 517, 2400),
# so this mean is not a true average time of day — confirm, and convert to
# minutes since midnight first if a real average is wanted.
flights %>%
  group_by(carrier) %>%
  summarise(mean = mean(dep_time, na.rm = TRUE), .groups = "drop")
## # A tibble: 16 x 2
## carrier mean
## <chr> <dbl>
## 1 9E 1487.
## 2 AA 1297.
## 3 AS 1295.
## 4 B6 1381.
## 5 DL 1351.
## 6 EV 1369.
## 7 F9 1438.
## 8 FL 1387.
## 9 HA 949.
## 10 MQ 1393.
## 11 OO 1725.
## 12 UA 1327.
## 13 US 1231.
## 14 VX 1280.
## 15 WN 1281.
## 16 YV 1601.
# arrange(): sort the rows by dep_time in ascending order.
arrange(flights, dep_time)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 13 1 2249 72 108 2357
## 2 2013 1 31 1 2100 181 124 2225
## 3 2013 11 13 1 2359 2 442 440
## 4 2013 12 16 1 2359 2 447 437
## 5 2013 12 20 1 2359 2 430 440
## 6 2013 12 26 1 2359 2 437 440
## 7 2013 12 30 1 2359 2 441 437
## 8 2013 2 11 1 2100 181 111 2225
## 9 2013 2 24 1 2245 76 121 2354
## 10 2013 3 8 1 2355 6 431 440
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# arrange() with desc(): sort by dep_time with the largest values first.
arrange(flights, desc(dep_time))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 30 2400 2359 1 327 337
## 2 2013 11 27 2400 2359 1 515 445
## 3 2013 12 5 2400 2359 1 427 440
## 4 2013 12 9 2400 2359 1 432 440
## 5 2013 12 9 2400 2250 70 59 2356
## 6 2013 12 13 2400 2359 1 432 440
## 7 2013 12 19 2400 2359 1 434 440
## 8 2013 12 29 2400 1700 420 302 2025
## 9 2013 2 7 2400 2359 1 432 436
## 10 2013 2 7 2400 2359 1 443 444
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# filter(): keep only the rows whose carrier code is "AA".
filter(flights, carrier == "AA")
## # A tibble: 32,729 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 542 540 2 923 850
## 2 2013 1 1 558 600 -2 753 745
## 3 2013 1 1 559 600 -1 941 910
## 4 2013 1 1 606 610 -4 858 910
## 5 2013 1 1 623 610 13 920 915
## 6 2013 1 1 628 630 -2 1137 1140
## 7 2013 1 1 629 630 -1 824 810
## 8 2013 1 1 635 635 0 1028 940
## 9 2013 1 1 656 700 -4 854 850
## 10 2013 1 1 656 659 -3 949 959
## # ... with 32,719 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# mutate(): add the elapsed clock time, in minutes, between departure and
# arrival.
# dep_time and arr_time are clock readings encoded as HHMM integers
# (e.g. 517 is 05:17), so subtracting them directly gives meaningless values
# (830 - 517 = 313, although 05:17 -> 08:30 is 193 minutes). Both are
# converted to minutes since midnight first, and the result is wrapped with
# %% 1440L so an arrival after midnight does not come out negative.
# NOTE(review): both times are local to their own airport, so this is a clock
# difference, not time in the air (see air_time) — confirm that is intended.
# The column keeps its original spelling, `differance`, so any existing
# reference to it keeps working.
flights2 <- flights %>%
  mutate(
    differance = (
      ((arr_time %/% 100L) * 60L + arr_time %% 100L) -
        ((dep_time %/% 100L) * 60L + dep_time %% 100L)
    ) %% 1440L
  )
# select(): show only the time/delay columns, the carrier and the new column.
# NOTE(review): the printed values below were recalculated by hand from the
# dep_time/arr_time columns shown; re-knit the document to confirm.
flights2 %>%
  select(dep_time, dep_delay, arr_time, arr_delay, carrier, differance)
## # A tibble: 336,776 x 6
## dep_time dep_delay arr_time arr_delay carrier differance
## <int> <dbl> <int> <dbl> <chr> <int>
## 1 517 2 830 11 UA 193
## 2 533 4 850 20 UA 197
## 3 542 2 923 33 AA 221
## 4 544 -1 1004 -18 B6 260
## 5 554 -6 812 -25 DL 138
## 6 554 -4 740 12 UA 106
## 7 555 -5 913 19 B6 198
## 8 557 -3 709 -14 EV 72
## 9 557 -3 838 -8 B6 161
## 10 558 -2 753 8 AA 115
## # ... with 336,766 more rows
- Penggunaan fungsi tersebut secara bersama-sama (jumlahnya bebas).
# select() + filter(): narrow to the time/delay columns and the carrier, then
# keep the "AA" rows with dep_time above 2000. Conditions passed to filter()
# as separate arguments are combined with a logical AND.
flights %>%
  select(dep_time, dep_delay, arr_time, arr_delay, carrier) %>%
  filter(carrier == "AA", dep_time > 2000)
## # A tibble: 1,506 x 5
## dep_time dep_delay arr_time arr_delay carrier
## <int> <dbl> <int> <dbl> <chr>
## 1 2002 7 2306 -4 AA
## 2 2013 -2 2120 -10 AA
## 3 2030 -15 2150 -35 AA
## 4 2128 -7 26 -24 AA
## 5 2205 285 46 246 AA
## 6 2003 8 2309 -1 AA
## 7 2023 8 2132 2 AA
## 8 2041 -4 2210 -15 AA
## 9 2134 -1 36 -14 AA
## 10 2015 0 2116 -14 AA
## # ... with 1,496 more rows
# filter() + select() + arrange(): keep the "UA" rows with dep_time below
# 2000, narrow to the time/delay columns and the carrier, then sort with the
# largest dep_time first.
flights %>%
  filter(carrier == "UA", dep_time < 2000) %>%
  select(dep_time, dep_delay, arr_time, arr_delay, carrier) %>%
  arrange(desc(dep_time))
## # A tibble: 52,138 x 5
## dep_time dep_delay arr_time arr_delay carrier
## <int> <dbl> <int> <dbl> <chr>
## 1 1959 -1 2310 3 UA
## 2 1959 -2 2133 -6 UA
## 3 1959 -6 2319 8 UA
## 4 1959 -6 2257 -18 UA
## 5 1959 -6 2233 -25 UA
## 6 1959 -6 2233 -26 UA
## 7 1959 -1 2312 -16 UA
## 8 1959 24 2233 -9 UA
## 9 1959 58 2100 20 UA
## 10 1959 -1 2056 -24 UA
## # ... with 52,128 more rows