#install.packages("nycflights13")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(nycflights13)
## Warning: 套件 'nycflights13' 是用 R 版本 4.4.2 來建造的
#Classic R code
rnorm( 100, mean = 4, sd = 4)
## [1] 10.09929549 4.57415630 1.51439615 5.05001243 8.33209807 -3.62712983
## [7] 6.53623408 6.52549165 -5.05627993 6.65848200 4.59935987 2.73155425
## [13] 6.70180671 -4.47402498 1.89500226 8.07320801 4.29037062 -4.26474967
## [19] 7.60806279 3.45131482 6.49388509 8.97476586 7.28627710 0.84208284
## [25] 2.40231109 3.75962766 2.76906049 3.88340044 -1.37857401 7.76302407
## [31] -0.01490318 5.37902524 1.84777001 4.68352428 5.35689502 9.37643655
## [37] -1.86217368 -1.90696952 8.77225151 3.99914802 1.26704461 0.18783125
## [43] 3.76078113 4.05831510 6.72023190 0.27941589 4.10941456 2.31560538
## [49] 4.80448967 -3.18258622 -1.62277178 -0.04611483 1.66627828 -0.32820608
## [55] 8.89934539 1.96100010 3.01368079 6.45692214 8.22669100 3.86160751
## [61] 3.33325058 7.87146078 14.43764690 -0.66631272 8.04676132 2.87224839
## [67] 1.55596256 0.73564499 9.25107721 0.96713231 0.05793790 8.67449310
## [73] 1.64999915 13.31367571 5.10144667 -2.47286085 -1.06371058 3.36505755
## [79] 3.22946924 5.36245750 6.13676361 3.20368332 -0.30795962 12.12934560
## [85] 5.00360705 7.71108473 9.77164268 3.35511554 3.31055691 6.86147018
## [91] 7.04930343 6.45729607 -4.13674753 11.02473671 6.35548077 1.45916964
## [97] 0.54991293 0.17019901 5.84354858 2.96003546
mean( rnorm( 100, mean = 4, sd = 4), trim = .1 )
## [1] 3.51543
# Pipeline R code with %>%
100%>%
rnorm(mean = 4, sd = 4)%>%
mean(trim = .1)
## [1] 4.234624
## dplyr ##
# dataset "flight" from package:nycflights13
## look at the data: flights ##
head(flights)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
summary(flights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00.00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00.00
## Median :29.00 Median :2013-07-03 10:00:00.00
## Mean :26.23 Mean :2013-07-03 05:22:54.64
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00.00
## Max. :59.00 Max. :2013-12-31 23:00:00.00
##
#############################
##### filter:篩選 rows #####
#############################
## 篩選 flights 中 month=1 且 day =1
filter( flights, month==1, day==1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights%>%
filter( month==1, day==1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
## 篩選 flights 中 flight arr_delay < dep_delay
flights%>%
filter(arr_delay < dep_delay)
## # A tibble: 221,565 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 554 600 -6 812 837
## 3 2013 1 1 557 600 -3 709 723
## 4 2013 1 1 557 600 -3 838 846
## 5 2013 1 1 558 600 -2 853 856
## 6 2013 1 1 558 600 -2 923 937
## 7 2013 1 1 559 559 0 702 706
## 8 2013 1 1 559 600 -1 854 902
## 9 2013 1 1 600 600 0 851 858
## 10 2013 1 1 601 600 1 844 850
## # ℹ 221,555 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
## 篩選 flights 中 dep_delay < 10
flights%>%
filter(dep_delay<10)
## # A tibble: 242,828 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 242,818 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
## 篩選 flights 中 hour < 12 and arr_delay <=10
flights%>%
filter(hour < 12 , arr_delay <=10)
## # A tibble: 104,629 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 554 600 -6 812 837
## 3 2013 1 1 557 600 -3 709 723
## 4 2013 1 1 557 600 -3 838 846
## 5 2013 1 1 558 600 -2 753 745
## 6 2013 1 1 558 600 -2 849 851
## 7 2013 1 1 558 600 -2 853 856
## 8 2013 1 1 558 600 -2 924 917
## 9 2013 1 1 558 600 -2 923 937
## 10 2013 1 1 559 559 0 702 706
## # ℹ 104,619 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights%>%
filter(hour < 12 & arr_delay <=10)
## # A tibble: 104,629 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 554 600 -6 812 837
## 3 2013 1 1 557 600 -3 709 723
## 4 2013 1 1 557 600 -3 838 846
## 5 2013 1 1 558 600 -2 753 745
## 6 2013 1 1 558 600 -2 849 851
## 7 2013 1 1 558 600 -2 853 856
## 8 2013 1 1 558 600 -2 924 917
## 9 2013 1 1 558 600 -2 923 937
## 10 2013 1 1 559 559 0 702 706
## # ℹ 104,619 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights%>%
filter(hour < 12 | arr_delay <=10)
## # A tibble: 262,117 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 262,107 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
########################
##### arrange:排序 #####
########################
## 篩選 flights 中 hour < 8 然後 依照 year, month, day排序
flights%>%
filter(hour<8)%>%
arrange(year,month,day)
## # A tibble: 50,726 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 50,716 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
## 針對 dep_delay 由大到小排序
flights%>%
arrange( desc(dep_delay))
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
################################
##### select:篩選 columns #####
################################
dim(flights)
## [1] 336776 19
## 篩選名字為 year, month, day 的 columns
flights%>%
select(year,month,day)
## # A tibble: 336,776 × 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ℹ 336,766 more rows
## 篩選 year到day 的 columns
flights%>%
select(year:day)
## # A tibble: 336,776 × 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ℹ 336,766 more rows
flights%>%
select(1:3)
## # A tibble: 336,776 × 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ℹ 336,766 more rows
## 不要 year到day 的 columns
flights%>%
select(-(year:day))
## # A tibble: 336,776 × 16
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
## <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 517 515 2 830 819 11 UA
## 2 533 529 4 850 830 20 UA
## 3 542 540 2 923 850 33 AA
## 4 544 545 -1 1004 1022 -18 B6
## 5 554 600 -6 812 837 -25 DL
## 6 554 558 -4 740 728 12 UA
## 7 555 600 -5 913 854 19 B6
## 8 557 600 -3 709 723 -14 EV
## 9 557 600 -3 838 846 -8 B6
## 10 558 600 -2 753 745 8 AA
## # ℹ 336,766 more rows
## # ℹ 9 more variables: flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
## select a 開頭的欄位 ##
flights%>%
select(starts_with("a"))
## # A tibble: 336,776 × 3
## arr_time arr_delay air_time
## <int> <dbl> <dbl>
## 1 830 11 227
## 2 850 20 227
## 3 923 33 160
## 4 1004 -18 183
## 5 812 -25 116
## 6 740 12 150
## 7 913 19 158
## 8 709 -14 53
## 9 838 -8 140
## 10 753 8 138
## # ℹ 336,766 more rows
################################
##### mutate:新增 columns #####
################################
## d1 = flights 新增一個col叫“gain" 為 arr_delay-dep_delay, 再加一個"speed" 為 distance/air_time*60
d1 = flights %>%
mutate( gain = arr_delay-dep_delay,
speed = distance/air_time*60)
## d2=
# 1. flights 新增一個col叫“gain" 為 arr_delay-dep_delay, 再加一個"speed" 為 distance/air_time*60
# 2. gain>0
d2 = flights %>%
mutate(gain = arr_delay-dep_delay,
speed = distance/air_time*60)%>%
filter(gain>0)%>%
arrange(desc(speed))
# 3. 針對 speed 由大到小排序
flights%>%
mutate(gain = arr_delay-dep_delay,
speed = distance/air_time*60) %>%
filter(gain>0) %>%
arrange( desc(speed))
## # A tibble: 98,799 × 21
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 2 21 2228 2230 -2 318 312
## 2 2013 3 11 1607 1550 17 2011 1950
## 3 2013 9 24 1544 1545 -1 1949 1939
## 4 2013 12 20 2158 2045 73 255 135
## 5 2013 2 10 549 540 9 1036 1016
## 6 2013 12 5 703 705 -2 1156 1156
## 7 2013 7 12 1925 1910 15 2247 2206
## 8 2013 1 13 20 2359 21 505 437
## 9 2013 12 6 657 656 1 1149 1145
## 10 2013 7 12 1515 1449 26 1816 1748
## # ℹ 98,789 more rows
## # ℹ 13 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, gain <dbl>, speed <dbl>
################################
##### summarise:匯總數據 ######
################################
mean(flights$dep_delay,na.rm=T)
## [1] 12.63907
flights%>%
summarise(delay = mean(dep_delay,na.rm=T))
## # A tibble: 1 × 1
## delay
## <dbl>
## 1 12.6
### aa中 針對 x 分組,然後計算y的平均 ##
aa=data.frame(x=c('a','a','b','b','c','c'),y=c(2,4,0,5,5,10))
aa%>%
group_by(x)%>%
summarise(mean.y = mean(y))
## # A tibble: 3 × 2
## x mean.y
## <chr> <dbl>
## 1 a 3
## 2 b 2.5
## 3 c 7.5
aa%>%
group_by(x)%>%
summarise("平均" = mean(y))
## # A tibble: 3 × 2
## x 平均
## <chr> <dbl>
## 1 a 3
## 2 b 2.5
## 3 c 7.5
## flights 中 計算出 每個tailnum的飛行次數,平均飛行距離,平均延遲抵達時間##
aa1 = flights %>%
group_by(tailnum) %>%
summarise(count = n(),
dist = mean(distance, na.rm = T),
delay = mean(arr_delay,na.rm = T)) %>%
filter(is.finite(delay)) %>%
arrange( desc(count))
## flights 中 計算出 每個dest中 總共的飛機(tailnum)個數,總共的飛行次數,再根據飛行次數排序
flights %>%
group_by(dest) %>%
summarise(n_plane = n_distinct(tailnum),
n_flight = n()) %>%
arrange( desc(n_flight))
## # A tibble: 105 × 3
## dest n_plane n_flight
## <chr> <int> <int>
## 1 ORD 1214 17283
## 2 ATL 1180 17215
## 3 LAX 992 16174
## 4 BOS 1308 15508
## 5 MCO 1201 14082
## 6 CLT 822 14064
## 7 SFO 855 13331
## 8 FLL 1062 12055
## 9 MIA 1175 11728
## 10 DCA 470 9705
## # ℹ 95 more rows
#### join 系列 連接不同table ####
bb=data.frame(x=c('a','b','d'),
z=c('dd1','dd2','dd3'))
# join aa and bb #
aa
## x y
## 1 a 2
## 2 a 4
## 3 b 0
## 4 b 5
## 5 c 5
## 6 c 10
bb
## x z
## 1 a dd1
## 2 b dd2
## 3 d dd3
inner_join(aa,bb,by = c('x'))
## x y z
## 1 a 2 dd1
## 2 a 4 dd1
## 3 b 0 dd2
## 4 b 5 dd2
left_join(aa,bb,by = c('x'))
## x y z
## 1 a 2 dd1
## 2 a 4 dd1
## 3 b 0 dd2
## 4 b 5 dd2
## 5 c 5 <NA>
## 6 c 10 <NA>
right_join(aa,bb,by = c('x'))
## x y z
## 1 a 2 dd1
## 2 a 4 dd1
## 3 b 0 dd2
## 4 b 5 dd2
## 5 d NA dd3
anti_join(aa,bb,by = c('x'))
## x y
## 1 c 5
## 2 c 10
semi_join(aa,bb,by = c('x'))
## x y
## 1 a 2
## 2 a 4
## 3 b 0
## 4 b 5
full_join(aa,bb,by = c('x'))
## x y z
## 1 a 2 dd1
## 2 a 4 dd1
## 3 b 0 dd2
## 4 b 5 dd2
## 5 c 5 <NA>
## 6 c 10 <NA>
## 7 d NA dd3
## 1. 找出 每輛飛機的 飛行次數,平均飛行距離,平均delay時間 ##
aa1 = flights %>%
group_by(tailnum) %>%
summarise(count = n(),
dist = mean(distance, na.rm = T),
delay = mean(arr_delay, na.rm = T)) %>%
filter(is.finite(delay)) %>%
arrange( desc(count))
## 2. 並依此畫出 飛行距離 和 delay 時間, 再找出好的呈現方式
library(ggplot2)
ggplot(aa1, aes(x = dist , y = delay)) +
geom_point(aes(size = count),
alpha = 0.5,
color = "purple") +
labs(title = "飛行距離與延遲時間的關係",
x = "飛行距離",
y = "延遲時間(分鐘)",
size = "飛行次數") +
theme_minimal() +
geom_smooth(method = "lm", se = F, color = "black")
## `geom_smooth()` using formula = 'y ~ x'

## combine all ##
## A-1. 基本操作複習
library(ggplot2movies)
data(movies)
head(movies)
## # A tibble: 6 × 24
## title year length budget rating votes r1 r2 r3 r4 r5 r6
## <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 $ 1971 121 NA 6.4 348 4.5 4.5 4.5 4.5 14.5 24.5
## 2 $1000 a … 1939 71 NA 6 20 0 14.5 4.5 24.5 14.5 14.5
## 3 $21 a Da… 1941 7 NA 8.2 5 0 0 0 0 0 24.5
## 4 $40,000 1996 70 NA 8.2 6 14.5 0 0 0 0 0
## 5 $50,000 … 1975 71 NA 3.4 17 24.5 4.5 0 14.5 14.5 4.5
## 6 $pent 2000 91 NA 4.3 45 4.5 4.5 4.5 14.5 14.5 14.5
## # ℹ 12 more variables: r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
## # Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
str(movies)
## tibble [58,788 × 24] (S3: tbl_df/tbl/data.frame)
## $ title : chr [1:58788] "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
## $ year : int [1:58788] 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
## $ length : int [1:58788] 121 71 7 70 71 91 93 25 97 61 ...
## $ budget : int [1:58788] NA NA NA NA NA NA NA NA NA NA ...
## $ rating : num [1:58788] 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
## $ votes : int [1:58788] 348 20 5 6 17 45 200 24 18 51 ...
## $ r1 : num [1:58788] 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
## $ r2 : num [1:58788] 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
## $ r3 : num [1:58788] 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
## $ r4 : num [1:58788] 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
## $ r5 : num [1:58788] 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
## $ r6 : num [1:58788] 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
## $ r7 : num [1:58788] 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
## $ r8 : num [1:58788] 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
## $ r9 : num [1:58788] 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
## $ r10 : num [1:58788] 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
## $ mpaa : chr [1:58788] "" "" "" "" ...
## $ Action : int [1:58788] 0 0 0 0 0 0 1 0 0 0 ...
## $ Animation : int [1:58788] 0 0 1 0 0 0 0 0 0 0 ...
## $ Comedy : int [1:58788] 1 1 0 1 0 0 0 0 0 0 ...
## $ Drama : int [1:58788] 1 0 0 0 0 1 1 0 1 0 ...
## $ Documentary: int [1:58788] 0 0 0 0 0 0 0 1 0 0 ...
## $ Romance : int [1:58788] 0 0 0 0 0 0 0 0 0 0 ...
## $ Short : int [1:58788] 0 0 1 0 0 0 0 1 0 0 ...
summary(movies)
## title year length budget
## Length:58788 Min. :1893 Min. : 1.00 Min. : 0
## Class :character 1st Qu.:1958 1st Qu.: 74.00 1st Qu.: 250000
## Mode :character Median :1983 Median : 90.00 Median : 3000000
## Mean :1976 Mean : 82.34 Mean : 13412513
## 3rd Qu.:1997 3rd Qu.: 100.00 3rd Qu.: 15000000
## Max. :2005 Max. :5220.00 Max. :200000000
## NA's :53573
## rating votes r1 r2
## Min. : 1.000 Min. : 5.0 Min. : 0.000 Min. : 0.000
## 1st Qu.: 5.000 1st Qu.: 11.0 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 6.100 Median : 30.0 Median : 4.500 Median : 4.500
## Mean : 5.933 Mean : 632.1 Mean : 7.014 Mean : 4.022
## 3rd Qu.: 7.000 3rd Qu.: 112.0 3rd Qu.: 4.500 3rd Qu.: 4.500
## Max. :10.000 Max. :157608.0 Max. :100.000 Max. :84.500
##
## r3 r4 r5 r6
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 4.500 1st Qu.: 4.50
## Median : 4.500 Median : 4.500 Median : 4.500 Median :14.50
## Mean : 4.721 Mean : 6.375 Mean : 9.797 Mean :13.04
## 3rd Qu.: 4.500 3rd Qu.: 4.500 3rd Qu.: 14.500 3rd Qu.:14.50
## Max. :84.500 Max. :100.000 Max. :100.000 Max. :84.50
##
## r7 r8 r9 r10
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 4.50 1st Qu.: 4.50 1st Qu.: 4.500 1st Qu.: 4.50
## Median : 14.50 Median : 14.50 Median : 4.500 Median : 14.50
## Mean : 15.55 Mean : 13.88 Mean : 8.954 Mean : 16.85
## 3rd Qu.: 24.50 3rd Qu.: 24.50 3rd Qu.: 14.500 3rd Qu.: 24.50
## Max. :100.00 Max. :100.00 Max. :100.000 Max. :100.00
##
## mpaa Action Animation Comedy
## Length:58788 Min. :0.00000 Min. :0.00000 Min. :0.0000
## Class :character 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Mode :character Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.07974 Mean :0.06277 Mean :0.2938
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000
##
## Drama Documentary Romance Short
## Min. :0.000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.371 Mean :0.05906 Mean :0.0807 Mean :0.1609
## 3rd Qu.:1.000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## a. 新增一欄位ratingGen為 rating 減去其平均值
mean_rating <- mean(movies$rating, na.rm = T)
movies <- movies %>%
mutate(ratingGen = rating - mean_rating)
##找出2000年後的電影
movies_after_2000 = movies %>%
filter(year > 2000)
## c. 篩選出前六 column (三種寫法)
movies%>%
select(1:6)
## # A tibble: 58,788 × 6
## title year length budget rating votes
## <chr> <int> <int> <int> <dbl> <int>
## 1 $ 1971 121 NA 6.4 348
## 2 $1000 a Touchdown 1939 71 NA 6 20
## 3 $21 a Day Once a Month 1941 7 NA 8.2 5
## 4 $40,000 1996 70 NA 8.2 6
## 5 $50,000 Climax Show, The 1975 71 NA 3.4 17
## 6 $pent 2000 91 NA 4.3 45
## 7 $windle 2002 93 NA 5.3 200
## 8 '15' 2002 25 NA 6.7 24
## 9 '38 1987 97 NA 6.6 18
## 10 '49-'17 1917 61 NA 6 51
## # ℹ 58,778 more rows
## A-2 計算出每年電影預算之平均值,並且畫出。
average_budget <- movies %>%
group_by(year) %>%
summarise(avg_budget = mean(budget, na.rm = T))
ggplot(average_budget, aes(x = year, y = avg_budget)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 2) +
labs(title = "每年電影預算的平均值",
x = "年份",
y = "平均預算 (美元)") +
theme_classic()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_point()`).

## A-3 找出1990年後 不同mpaa分級之戲劇片以及非劇情片的平均評價,並用ggplot畫出
##B -1. 找出 每輛飛機的 飛行次數,平均飛行距離,平均delay時間
ggplot(aa1, aes(x = dist , y = delay)) +
geom_point(aes(size = count),
alpha = 0.8,
color = "blue") +
labs(title = "飛行距離與延遲時間的關係",
x = "飛行距離",
y = "延遲時間(分鐘)",
size = "飛行次數") +
theme_classic()

## B - 2. 根據 B-1結果篩選出飛行次數>20,平均飛行距離<2000飛機,並依此畫出 飛行距離 和 delay時間關聯, 再找出好的呈現方式
cc1 = flights %>%
group_by(tailnum) %>%
summarise( count = n(),
dist = mean(distance, na.rm = T),
delay = mean(arr_delay, na.rm = T)) %>%
filter(count > 20, dist < 2000)
ggplot(cc1, aes(x = dist , y = delay)) +
geom_point(aes(size = count),
alpha = 0.8,
color = "pink") +
labs(title = "飛行距離與延遲時間的關係",
x = "飛行距離",
y = "延遲時間(分鐘)",
size = "飛行次數") +
theme_classic()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
