# Baixando e carregando os pacotes a serem usados no desenvolvimento:
# Install the packages used by this script only when they are not already
# available. Calling install.packages() unconditionally downloads and
# reinstalls both packages every time the script is run.
for (pkg in c("tidyverse", "nycflights13")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nycflights13)
# Bind the `flights` table exported by nycflights13 to a local name,
# then print it to inspect its columns and first rows.
flights <- nycflights13::flights
flights
## # A tibble: 336,776 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
?flights
# Using filter(): keep only the rows whose `arr_delay` is 120 or more.
a <- flights %>%
  filter(arr_delay >= 120)
a
## # A tibble: 10,200 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 811 630 101 1047 830 137 MQ
## 2 2013 1 1 848 1835 853 1001 1950 851 MQ
## 3 2013 1 1 957 733 144 1056 853 123 UA
## 4 2013 1 1 1114 900 134 1447 1222 145 UA
## 5 2013 1 1 1505 1310 115 1638 1431 127 EV
## 6 2013 1 1 1525 1340 105 1831 1626 125 B6
## 7 2013 1 1 1549 1445 64 1912 1656 136 EV
## 8 2013 1 1 1558 1359 119 1718 1515 123 EV
## 9 2013 1 1 1732 1630 62 2028 1825 123 EV
## 10 2013 1 1 1803 1620 103 2008 1750 138 MQ
## # … with 10,190 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Flights whose destination is either IAH or HOU.
# `%in%` tests set membership for every row. The previous
# `dest == c("IAH", "HOU")` recycled the two-element vector along the column,
# so each row was compared with only one of the two codes (by position) and
# matching rows that did not line up with the recycled value were silently
# dropped.
# NOTE: the printed output recorded right after this chunk was produced by the
# old comparison and needs to be regenerated.
b <- filter(flights, dest %in% c("IAH", "HOU"))
b
## # A tibble: 4,655 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 623 627 -4 933 932 1 UA
## 3 2013 1 1 1028 1026 2 1350 1339 11 UA
## 4 2013 1 1 1114 900 134 1447 1222 145 UA
## 5 2013 1 1 1208 1158 10 1540 1502 38 B6
## 6 2013 1 1 1306 1300 6 1622 1610 12 WN
## 7 2013 1 1 1527 1515 12 1854 1810 44 UA
## 8 2013 1 1 1620 1620 0 1945 1922 23 UA
## 9 2013 1 1 1725 1720 5 2045 2021 24 UA
## 10 2013 1 1 1855 1848 7 2203 2200 3 UA
## # … with 4,645 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# United Airlines = UA, American Airlines = AA, Delta Airlines = DL
# Flights operated by any of these three carriers.
# `%in%` checks each row against the whole set. The previous
# `carrier == c("UA", "AA", "DL")` recycled the three codes positionally
# (R warned that the lengths were not multiples of each other), so a row was
# kept only when its carrier happened to match the code at its position.
# With `%in%` that recycling warning is no longer emitted.
# NOTE: the printed output recorded right after this chunk was produced by the
# old comparison and needs to be regenerated.
c <- filter(flights, carrier %in% c("UA", "AA", "DL"))
c
## # A tibble: 46,913 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 558 600 -2 924 917 7 UA
## 3 2013 1 1 602 610 -8 812 820 -8 DL
## 4 2013 1 1 606 610 -4 858 910 -12 AA
## 5 2013 1 1 606 610 -4 837 845 -8 DL
## 6 2013 1 1 607 607 0 858 915 -17 UA
## 7 2013 1 1 615 615 0 833 842 -9 DL
## 8 2013 1 1 623 610 13 920 915 5 AA
## 9 2013 1 1 643 646 -3 922 940 -18 UA
## 10 2013 1 1 653 700 -7 936 1009 -33 DL
## # … with 46,903 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Flights that departed in month 7, 8 or 9.
# In the previous `month == 7 & 8 & 9`, the bare numbers 8 and 9 are coerced
# to TRUE, so the condition collapsed to `month == 7` and only July rows were
# returned. `%in%` compares each row's month against all three values.
# NOTE: the printed output recorded right after this chunk was produced by the
# old condition and needs to be regenerated.
d <- filter(flights, month %in% 7:9)
d
## # A tibble: 29,425 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 7 1 1 2029 212 236 2359 157 B6
## 2 2013 7 1 2 2359 3 344 344 0 B6
## 3 2013 7 1 29 2245 104 151 1 110 B6
## 4 2013 7 1 43 2130 193 322 14 188 B6
## 5 2013 7 1 44 2150 174 300 100 120 AA
## 6 2013 7 1 46 2051 235 304 2358 186 B6
## 7 2013 7 1 48 2001 287 308 2305 243 VX
## 8 2013 7 1 58 2155 183 335 43 172 B6
## 9 2013 7 1 100 2146 194 327 30 177 B6
## 10 2013 7 1 100 2245 135 337 135 122 B6
## # … with 29,415 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Flights that did not leave late (dep_delay <= 0) but still have an
# arr_delay of 120 or more. filter() combines comma-separated conditions
# with a logical AND.
e <- flights %>%
  filter(dep_delay <= 0, arr_delay >= 120)
e
## # A tibble: 29 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 27 1419 1420 -1 1754 1550 124 MQ
## 2 2013 10 7 1350 1350 0 1736 1526 130 EV
## 3 2013 10 7 1357 1359 -2 1858 1654 124 AA
## 4 2013 10 16 657 700 -3 1258 1056 122 B6
## 5 2013 11 1 658 700 -2 1329 1015 194 VX
## 6 2013 3 18 1844 1847 -3 39 2219 140 UA
## 7 2013 4 17 1635 1640 -5 2049 1845 124 MQ
## 8 2013 4 18 558 600 -2 1149 850 179 AA
## 9 2013 4 18 655 700 -5 1213 950 143 AA
## 10 2013 5 22 1827 1830 -3 2217 2010 127 MQ
## # … with 19 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Flights whose dep_time falls in the range 0..600 (inclusive).
# The previous `dep_time == 0000 : 600` compared the column element-by-element
# against the recycled sequence 0, 1, ..., 600 (R warned about the mismatched
# lengths), so a row was kept only when its dep_time equalled the sequence
# value at its position. between() performs the intended range test per row.
# NOTE(review): if the data set encodes midnight as 2400 rather than 0, add
# `| dep_time == 2400` to the condition -- confirm against ?flights.
# NOTE: the printed output recorded right after this chunk was produced by the
# old comparison and needs to be regenerated.
f <- filter(flights, between(dep_time, 0, 600))
f
## # A tibble: 25 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 30 556 600 -4 709 658 11 US
## 2 2013 1 30 557 600 -3 711 709 2 B6
## 3 2013 1 30 559 601 -2 739 725 14 EV
## 4 2013 12 11 542 545 -3 841 832 9 UA
## 5 2013 12 11 544 550 -6 1021 1027 -6 B6
## 6 2013 12 11 557 600 -3 853 846 7 B6
## 7 2013 2 20 557 600 -3 733 745 -12 AA
## 8 2013 2 24 556 600 -4 914 909 5 UA
## 9 2013 3 30 550 600 -10 721 759 -38 DL
## 10 2013 3 30 555 600 -5 804 829 -25 DL
## # … with 15 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Flights ordered from the largest to the smallest departure delay.
delayed <- flights %>%
  arrange(desc(dep_delay))
# Same data ordered by arrival delay instead (printed only, not stored).
arrange(flights, desc(arr_delay))
## # A tibble: 336,776 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 9 641 900 1301 1242 1530 1272 HA
## 2 2013 6 15 1432 1935 1137 1607 2120 1127 MQ
## 3 2013 1 10 1121 1635 1126 1239 1810 1109 MQ
## 4 2013 9 20 1139 1845 1014 1457 2210 1007 AA
## 5 2013 7 22 845 1600 1005 1044 1815 989 MQ
## 6 2013 4 10 1100 1900 960 1342 2211 931 DL
## 7 2013 3 17 2321 810 911 135 1020 915 DL
## 8 2013 7 22 2257 759 898 121 1026 895 DL
## 9 2013 12 5 756 1700 896 1058 2020 878 AA
## 10 2013 5 3 1133 2055 878 1250 2215 875 MQ
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Print `delayed`: the flights sorted by descending dep_delay.
delayed
## # A tibble: 336,776 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 9 641 900 1301 1242 1530 1272 HA
## 2 2013 6 15 1432 1935 1137 1607 2120 1127 MQ
## 3 2013 1 10 1121 1635 1126 1239 1810 1109 MQ
## 4 2013 9 20 1139 1845 1014 1457 2210 1007 AA
## 5 2013 7 22 845 1600 1005 1044 1815 989 MQ
## 6 2013 4 10 1100 1900 960 1342 2211 931 DL
## 7 2013 3 17 2321 810 911 135 1020 915 DL
## 8 2013 6 27 959 1900 899 1236 2226 850 DL
## 9 2013 7 22 2257 759 898 121 1026 895 DL
## 10 2013 12 5 756 1700 896 1058 2020 878 AA
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Add FlightSpeed = distance / air_time * 60 and order the flights from the
# highest to the lowest value.
# NOTE(review): presumably miles per hour, assuming distance is in miles and
# air_time in minutes -- confirm against ?flights.
fastest <- flights %>%
  mutate(FlightSpeed = distance / air_time * 60) %>%
  arrange(desc(FlightSpeed))
fastest
## # A tibble: 336,776 × 20
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 5 25 1709 1700 9 1923 1937 -14 DL
## 2 2013 7 2 1558 1513 45 1745 1719 26 EV
## 3 2013 5 13 2040 2025 15 2225 2226 -1 EV
## 4 2013 3 23 1914 1910 4 2045 2043 2 EV
## 5 2013 1 12 1559 1600 -1 1849 1917 -28 DL
## 6 2013 11 17 650 655 -5 1059 1150 -51 DL
## 7 2013 2 21 2355 2358 -3 412 438 -26 B6
## 8 2013 11 17 759 800 -1 1212 1255 -43 AA
## 9 2013 11 16 2003 1925 38 17 36 -19 DL
## 10 2013 11 16 2349 2359 -10 402 440 -38 B6
## # … with 336,766 more rows, 10 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, FlightSpeed <dbl>, and abbreviated variable
## # names ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# No matter how many times a variable is repeated inside select(), it shows
# up only once in the result.
o_que_acontece <- select(flights, origin, origin, origin, origin, origin)
o_que_acontece
## # A tibble: 336,776 × 1
## origin
## <chr>
## 1 EWR
## 2 LGA
## 3 JFK
## 4 JFK
## 5 LGA
## 6 EWR
## 7 EWR
## 8 LGA
## 9 JFK
## 10 LGA
## # … with 336,766 more rows
# Look up what min_rank() does.
?min_rank()
# Ten most delayed departures, selected with the ranking function min_rank().
# The previous version only sorted by the rank and copied dep_delay into a new
# column, so `TenDelayed` still held every flight. Filtering on rank <= 10
# keeps just the top ten. min_rank() gives tied values the same rank, so ties
# at the cut-off are all kept, and rows with a missing dep_delay get an NA
# rank and are dropped by filter().
# `Ten_most_delayed` is kept as a copy of dep_delay, as before.
# NOTE: the printed output recorded right after this chunk was produced by the
# old code (all rows) and needs to be regenerated.
TenDelayed <- flights %>%
  filter(min_rank(desc(dep_delay)) <= 10) %>%
  arrange(min_rank(desc(dep_delay))) %>%
  mutate(Ten_most_delayed = dep_delay)
TenDelayed
## # A tibble: 336,776 × 20
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 9 641 900 1301 1242 1530 1272 HA
## 2 2013 6 15 1432 1935 1137 1607 2120 1127 MQ
## 3 2013 1 10 1121 1635 1126 1239 1810 1109 MQ
## 4 2013 9 20 1139 1845 1014 1457 2210 1007 AA
## 5 2013 7 22 845 1600 1005 1044 1815 989 MQ
## 6 2013 4 10 1100 1900 960 1342 2211 931 DL
## 7 2013 3 17 2321 810 911 135 1020 915 DL
## 8 2013 6 27 959 1900 899 1236 2226 850 DL
## 9 2013 7 22 2257 759 898 121 1026 895 DL
## 10 2013 12 5 756 1700 896 1058 2020 878 AA
## # … with 336,766 more rows, 10 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, Ten_most_delayed <dbl>, and abbreviated
## # variable names ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time,
## # ⁵arr_delay
# Look up how to use the arithmetic mean function.
?mean()
# Store the overall mean of air_time (missing values ignored) in a new
# column; the same value is repeated on every row.
average <- flights %>%
  mutate(average_air_time = mean(air_time, na.rm = TRUE))
average
## # A tibble: 336,776 × 20
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 336,766 more rows, 10 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, average_air_time <dbl>, and abbreviated
## # variable names ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time,
## # ⁵arr_delay