#install.packages("nycflights13")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(nycflights13)
## Warning: 套件 'nycflights13' 是用 R 版本 4.4.2 來建造的
#Classic    R   code
rnorm(  100,    mean    =   4,  sd  =   4)
##   [1] 10.09929549  4.57415630  1.51439615  5.05001243  8.33209807 -3.62712983
##   [7]  6.53623408  6.52549165 -5.05627993  6.65848200  4.59935987  2.73155425
##  [13]  6.70180671 -4.47402498  1.89500226  8.07320801  4.29037062 -4.26474967
##  [19]  7.60806279  3.45131482  6.49388509  8.97476586  7.28627710  0.84208284
##  [25]  2.40231109  3.75962766  2.76906049  3.88340044 -1.37857401  7.76302407
##  [31] -0.01490318  5.37902524  1.84777001  4.68352428  5.35689502  9.37643655
##  [37] -1.86217368 -1.90696952  8.77225151  3.99914802  1.26704461  0.18783125
##  [43]  3.76078113  4.05831510  6.72023190  0.27941589  4.10941456  2.31560538
##  [49]  4.80448967 -3.18258622 -1.62277178 -0.04611483  1.66627828 -0.32820608
##  [55]  8.89934539  1.96100010  3.01368079  6.45692214  8.22669100  3.86160751
##  [61]  3.33325058  7.87146078 14.43764690 -0.66631272  8.04676132  2.87224839
##  [67]  1.55596256  0.73564499  9.25107721  0.96713231  0.05793790  8.67449310
##  [73]  1.64999915 13.31367571  5.10144667 -2.47286085 -1.06371058  3.36505755
##  [79]  3.22946924  5.36245750  6.13676361  3.20368332 -0.30795962 12.12934560
##  [85]  5.00360705  7.71108473  9.77164268  3.35511554  3.31055691  6.86147018
##  [91]  7.04930343  6.45729607 -4.13674753 11.02473671  6.35548077  1.45916964
##  [97]  0.54991293  0.17019901  5.84354858  2.96003546
mean(   rnorm(  100,    mean    =   4,  sd  =   4), trim    =   .1  )
## [1] 3.51543
# Pipeline  R   code    with    %>%
100%>%
  rnorm(mean = 4, sd = 4)%>%
  mean(trim = .1)
## [1] 4.234624
## dplyr ##
# dataset "flight" from package:nycflights13

## look at the data: flights ##
head(flights)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
summary(flights)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                     
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00.00  
##  Median :29.00   Median :2013-07-03 10:00:00.00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54.64  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00.00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00  
## 
#############################
##### filter:篩選 rows #####
#############################

## 篩選 flights 中 month=1 且 day =1
filter( flights, month==1, day==1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights%>%
  filter( month==1, day==1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
## 篩選 flights 中 flight arr_delay < dep_delay
flights%>%
  filter(arr_delay < dep_delay)
## # A tibble: 221,565 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      544            545        -1     1004           1022
##  2  2013     1     1      554            600        -6      812            837
##  3  2013     1     1      557            600        -3      709            723
##  4  2013     1     1      557            600        -3      838            846
##  5  2013     1     1      558            600        -2      853            856
##  6  2013     1     1      558            600        -2      923            937
##  7  2013     1     1      559            559         0      702            706
##  8  2013     1     1      559            600        -1      854            902
##  9  2013     1     1      600            600         0      851            858
## 10  2013     1     1      601            600         1      844            850
## # ℹ 221,555 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
## 篩選 flights 中 dep_delay < 10
flights%>%
  filter(dep_delay<10)
## # A tibble: 242,828 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 242,818 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
## 篩選 flights 中 hour < 12 and arr_delay <=10
flights%>%
  filter(hour < 12 , arr_delay <=10)
## # A tibble: 104,629 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      544            545        -1     1004           1022
##  2  2013     1     1      554            600        -6      812            837
##  3  2013     1     1      557            600        -3      709            723
##  4  2013     1     1      557            600        -3      838            846
##  5  2013     1     1      558            600        -2      753            745
##  6  2013     1     1      558            600        -2      849            851
##  7  2013     1     1      558            600        -2      853            856
##  8  2013     1     1      558            600        -2      924            917
##  9  2013     1     1      558            600        -2      923            937
## 10  2013     1     1      559            559         0      702            706
## # ℹ 104,619 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights%>%
  filter(hour < 12 & arr_delay <=10)
## # A tibble: 104,629 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      544            545        -1     1004           1022
##  2  2013     1     1      554            600        -6      812            837
##  3  2013     1     1      557            600        -3      709            723
##  4  2013     1     1      557            600        -3      838            846
##  5  2013     1     1      558            600        -2      753            745
##  6  2013     1     1      558            600        -2      849            851
##  7  2013     1     1      558            600        -2      853            856
##  8  2013     1     1      558            600        -2      924            917
##  9  2013     1     1      558            600        -2      923            937
## 10  2013     1     1      559            559         0      702            706
## # ℹ 104,619 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights%>%
  filter(hour < 12 | arr_delay <=10)
## # A tibble: 262,117 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 262,107 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
########################
##### arrange:排序 #####
########################

## 篩選 flights 中 hour < 8 然後 依照 year, month, day排序

  flights%>%
    filter(hour<8)%>%
    arrange(year,month,day)
## # A tibble: 50,726 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 50,716 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
## 針對 dep_delay 由大到小排序 
  flights%>% 
    arrange( desc(dep_delay))
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
################################
##### select:篩選 columns #####
################################
dim(flights)
## [1] 336776     19
## 篩選名字為 year, month, day 的 columns
  flights%>%
  select(year,month,day)
## # A tibble: 336,776 × 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ℹ 336,766 more rows
## 篩選 year到day 的 columns
flights%>%
  select(year:day)
## # A tibble: 336,776 × 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ℹ 336,766 more rows
flights%>%
  select(1:3)
## # A tibble: 336,776 × 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ℹ 336,766 more rows
## 不要 year到day 的 columns

flights%>%
  select(-(year:day))
## # A tibble: 336,776 × 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
##       <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>  
##  1      517            515         2      830            819        11 UA     
##  2      533            529         4      850            830        20 UA     
##  3      542            540         2      923            850        33 AA     
##  4      544            545        -1     1004           1022       -18 B6     
##  5      554            600        -6      812            837       -25 DL     
##  6      554            558        -4      740            728        12 UA     
##  7      555            600        -5      913            854        19 B6     
##  8      557            600        -3      709            723       -14 EV     
##  9      557            600        -3      838            846        -8 B6     
## 10      558            600        -2      753            745         8 AA     
## # ℹ 336,766 more rows
## # ℹ 9 more variables: flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
## select a 開頭的欄位 ##
flights%>%
select(starts_with("a"))
## # A tibble: 336,776 × 3
##    arr_time arr_delay air_time
##       <int>     <dbl>    <dbl>
##  1      830        11      227
##  2      850        20      227
##  3      923        33      160
##  4     1004       -18      183
##  5      812       -25      116
##  6      740        12      150
##  7      913        19      158
##  8      709       -14       53
##  9      838        -8      140
## 10      753         8      138
## # ℹ 336,766 more rows
################################
##### mutate:新增 columns #####
################################

## d1 = flights 新增一個col叫“gain" 為 arr_delay-dep_delay, 再加一個"speed" 為 distance/air_time*60
d1 = flights %>%
mutate( gain = arr_delay-dep_delay,
        speed = distance/air_time*60)
## d2= 
#  1. flights 新增一個col叫“gain" 為 arr_delay-dep_delay, 再加一個"speed" 為 distance/air_time*60
#  2. gain>0
d2 = flights %>%
  mutate(gain = arr_delay-dep_delay,
         speed = distance/air_time*60)%>%
  filter(gain>0)%>%
  arrange(desc(speed))

#  3. 針對 speed 由大到小排序
flights%>%
  mutate(gain = arr_delay-dep_delay,
         speed = distance/air_time*60) %>%
filter(gain>0) %>%
arrange( desc(speed))
## # A tibble: 98,799 × 21
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     2    21     2228           2230        -2      318            312
##  2  2013     3    11     1607           1550        17     2011           1950
##  3  2013     9    24     1544           1545        -1     1949           1939
##  4  2013    12    20     2158           2045        73      255            135
##  5  2013     2    10      549            540         9     1036           1016
##  6  2013    12     5      703            705        -2     1156           1156
##  7  2013     7    12     1925           1910        15     2247           2206
##  8  2013     1    13       20           2359        21      505            437
##  9  2013    12     6      657            656         1     1149           1145
## 10  2013     7    12     1515           1449        26     1816           1748
## # ℹ 98,789 more rows
## # ℹ 13 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, gain <dbl>, speed <dbl>
################################
##### summarise:匯總數據 ######
################################
mean(flights$dep_delay,na.rm=T)
## [1] 12.63907
flights%>%
  summarise(delay = mean(dep_delay,na.rm=T))
## # A tibble: 1 × 1
##   delay
##   <dbl>
## 1  12.6
### aa中 針對 x 分組,然後計算y的平均 ##
aa=data.frame(x=c('a','a','b','b','c','c'),y=c(2,4,0,5,5,10))

aa%>%
  group_by(x)%>%
  summarise(mean.y = mean(y))
## # A tibble: 3 × 2
##   x     mean.y
##   <chr>  <dbl>
## 1 a        3  
## 2 b        2.5
## 3 c        7.5
aa%>%
  group_by(x)%>%
  summarise("平均" = mean(y))
## # A tibble: 3 × 2
##   x      平均
##   <chr> <dbl>
## 1 a       3  
## 2 b       2.5
## 3 c       7.5
## flights 中 計算出 每個tailnum的飛行次數,平均飛行距離,平均延遲抵達時間##
aa1 = flights %>%
  group_by(tailnum) %>%
  summarise(count = n(),
            dist = mean(distance, na.rm = T),
            delay = mean(arr_delay,na.rm = T)) %>%
  filter(is.finite(delay)) %>%
  arrange( desc(count))
  

## flights 中 計算出 每個dest中 總共的飛機(tailnum)個數,總共的飛行次數,再根據飛行次數排序
flights %>%
  group_by(dest) %>%
  summarise(n_plane = n_distinct(tailnum),
            n_flight = n()) %>%
  arrange( desc(n_flight))
## # A tibble: 105 × 3
##    dest  n_plane n_flight
##    <chr>   <int>    <int>
##  1 ORD      1214    17283
##  2 ATL      1180    17215
##  3 LAX       992    16174
##  4 BOS      1308    15508
##  5 MCO      1201    14082
##  6 CLT       822    14064
##  7 SFO       855    13331
##  8 FLL      1062    12055
##  9 MIA      1175    11728
## 10 DCA       470     9705
## # ℹ 95 more rows
#### join 系列 連接不同table ####

bb=data.frame(x=c('a','b','d'),
              z=c('dd1','dd2','dd3'))

# join aa and bb #
aa
##   x  y
## 1 a  2
## 2 a  4
## 3 b  0
## 4 b  5
## 5 c  5
## 6 c 10
bb
##   x   z
## 1 a dd1
## 2 b dd2
## 3 d dd3
inner_join(aa,bb,by = c('x'))
##   x y   z
## 1 a 2 dd1
## 2 a 4 dd1
## 3 b 0 dd2
## 4 b 5 dd2
left_join(aa,bb,by = c('x'))
##   x  y    z
## 1 a  2  dd1
## 2 a  4  dd1
## 3 b  0  dd2
## 4 b  5  dd2
## 5 c  5 <NA>
## 6 c 10 <NA>
right_join(aa,bb,by = c('x'))
##   x  y   z
## 1 a  2 dd1
## 2 a  4 dd1
## 3 b  0 dd2
## 4 b  5 dd2
## 5 d NA dd3
anti_join(aa,bb,by = c('x'))
##   x  y
## 1 c  5
## 2 c 10
semi_join(aa,bb,by = c('x'))
##   x y
## 1 a 2
## 2 a 4
## 3 b 0
## 4 b 5
full_join(aa,bb,by = c('x'))
##   x  y    z
## 1 a  2  dd1
## 2 a  4  dd1
## 3 b  0  dd2
## 4 b  5  dd2
## 5 c  5 <NA>
## 6 c 10 <NA>
## 7 d NA  dd3
## 1. 找出 每輛飛機的 飛行次數,平均飛行距離,平均delay時間  ##
aa1 = flights %>%
  group_by(tailnum) %>%
  summarise(count = n(),                     
            dist = mean(distance, na.rm = T), 
            delay = mean(arr_delay, na.rm = T)) %>%
  filter(is.finite(delay)) %>%
  arrange( desc(count))

## 2. 並依此畫出 飛行距離 和 delay 時間, 再找出好的呈現方式

library(ggplot2)
ggplot(aa1, aes(x = dist , y = delay)) +
  geom_point(aes(size = count), 
             alpha = 0.5, 
             color = "purple") +
  labs(title = "飛行距離與延遲時間的關係",
      x = "飛行距離",
      y = "延遲時間(分鐘)",
      size = "飛行次數") +
  theme_minimal() +
  geom_smooth(method = "lm", se = F, color = "black")
## `geom_smooth()` using formula = 'y ~ x'

## combine all ##

## A-1. 基本操作複習

library(ggplot2movies)
data(movies)
head(movies)
## # A tibble: 6 × 24
##   title      year length budget rating votes    r1    r2    r3    r4    r5    r6
##   <chr>     <int>  <int>  <int>  <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 $          1971    121     NA    6.4   348   4.5   4.5   4.5   4.5  14.5  24.5
## 2 $1000 a …  1939     71     NA    6      20   0    14.5   4.5  24.5  14.5  14.5
## 3 $21 a Da…  1941      7     NA    8.2     5   0     0     0     0     0    24.5
## 4 $40,000    1996     70     NA    8.2     6  14.5   0     0     0     0     0  
## 5 $50,000 …  1975     71     NA    3.4    17  24.5   4.5   0    14.5  14.5   4.5
## 6 $pent      2000     91     NA    4.3    45   4.5   4.5   4.5  14.5  14.5  14.5
## # ℹ 12 more variables: r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
## #   Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>
str(movies)
## tibble [58,788 × 24] (S3: tbl_df/tbl/data.frame)
##  $ title      : chr [1:58788] "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
##  $ year       : int [1:58788] 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
##  $ length     : int [1:58788] 121 71 7 70 71 91 93 25 97 61 ...
##  $ budget     : int [1:58788] NA NA NA NA NA NA NA NA NA NA ...
##  $ rating     : num [1:58788] 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
##  $ votes      : int [1:58788] 348 20 5 6 17 45 200 24 18 51 ...
##  $ r1         : num [1:58788] 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
##  $ r2         : num [1:58788] 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
##  $ r3         : num [1:58788] 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
##  $ r4         : num [1:58788] 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
##  $ r5         : num [1:58788] 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
##  $ r6         : num [1:58788] 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
##  $ r7         : num [1:58788] 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
##  $ r8         : num [1:58788] 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
##  $ r9         : num [1:58788] 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
##  $ r10        : num [1:58788] 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
##  $ mpaa       : chr [1:58788] "" "" "" "" ...
##  $ Action     : int [1:58788] 0 0 0 0 0 0 1 0 0 0 ...
##  $ Animation  : int [1:58788] 0 0 1 0 0 0 0 0 0 0 ...
##  $ Comedy     : int [1:58788] 1 1 0 1 0 0 0 0 0 0 ...
##  $ Drama      : int [1:58788] 1 0 0 0 0 1 1 0 1 0 ...
##  $ Documentary: int [1:58788] 0 0 0 0 0 0 0 1 0 0 ...
##  $ Romance    : int [1:58788] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Short      : int [1:58788] 0 0 1 0 0 0 0 1 0 0 ...
summary(movies)
##     title                year          length            budget         
##  Length:58788       Min.   :1893   Min.   :   1.00   Min.   :        0  
##  Class :character   1st Qu.:1958   1st Qu.:  74.00   1st Qu.:   250000  
##  Mode  :character   Median :1983   Median :  90.00   Median :  3000000  
##                     Mean   :1976   Mean   :  82.34   Mean   : 13412513  
##                     3rd Qu.:1997   3rd Qu.: 100.00   3rd Qu.: 15000000  
##                     Max.   :2005   Max.   :5220.00   Max.   :200000000  
##                                                      NA's   :53573      
##      rating           votes                r1                r2        
##  Min.   : 1.000   Min.   :     5.0   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.: 5.000   1st Qu.:    11.0   1st Qu.:  0.000   1st Qu.: 0.000  
##  Median : 6.100   Median :    30.0   Median :  4.500   Median : 4.500  
##  Mean   : 5.933   Mean   :   632.1   Mean   :  7.014   Mean   : 4.022  
##  3rd Qu.: 7.000   3rd Qu.:   112.0   3rd Qu.:  4.500   3rd Qu.: 4.500  
##  Max.   :10.000   Max.   :157608.0   Max.   :100.000   Max.   :84.500  
##                                                                        
##        r3               r4                r5                r6       
##  Min.   : 0.000   Min.   :  0.000   Min.   :  0.000   Min.   : 0.00  
##  1st Qu.: 0.000   1st Qu.:  0.000   1st Qu.:  4.500   1st Qu.: 4.50  
##  Median : 4.500   Median :  4.500   Median :  4.500   Median :14.50  
##  Mean   : 4.721   Mean   :  6.375   Mean   :  9.797   Mean   :13.04  
##  3rd Qu.: 4.500   3rd Qu.:  4.500   3rd Qu.: 14.500   3rd Qu.:14.50  
##  Max.   :84.500   Max.   :100.000   Max.   :100.000   Max.   :84.50  
##                                                                      
##        r7               r8               r9               r10        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Min.   :  0.00  
##  1st Qu.:  4.50   1st Qu.:  4.50   1st Qu.:  4.500   1st Qu.:  4.50  
##  Median : 14.50   Median : 14.50   Median :  4.500   Median : 14.50  
##  Mean   : 15.55   Mean   : 13.88   Mean   :  8.954   Mean   : 16.85  
##  3rd Qu.: 24.50   3rd Qu.: 24.50   3rd Qu.: 14.500   3rd Qu.: 24.50  
##  Max.   :100.00   Max.   :100.00   Max.   :100.000   Max.   :100.00  
##                                                                      
##      mpaa               Action          Animation           Comedy      
##  Length:58788       Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  Class :character   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Mode  :character   Median :0.00000   Median :0.00000   Median :0.0000  
##                     Mean   :0.07974   Mean   :0.06277   Mean   :0.2938  
##                     3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##                     Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##                                                                         
##      Drama        Documentary         Romance           Short       
##  Min.   :0.000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.371   Mean   :0.05906   Mean   :0.0807   Mean   :0.1609  
##  3rd Qu.:1.000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
## 
## a. 新增一欄位ratingGen為 rating 減去其平均值
mean_rating <- mean(movies$rating, na.rm = T)
movies <- movies %>%
  mutate(ratingGen = rating - mean_rating)
##找出2000年後的電影
movies_after_2000 = movies %>%
  filter(year > 2000)
## c. 篩選出前六 column (三種寫法)
movies%>%
  select(1:6)
## # A tibble: 58,788 × 6
##    title                     year length budget rating votes
##    <chr>                    <int>  <int>  <int>  <dbl> <int>
##  1 $                         1971    121     NA    6.4   348
##  2 $1000 a Touchdown         1939     71     NA    6      20
##  3 $21 a Day Once a Month    1941      7     NA    8.2     5
##  4 $40,000                   1996     70     NA    8.2     6
##  5 $50,000 Climax Show, The  1975     71     NA    3.4    17
##  6 $pent                     2000     91     NA    4.3    45
##  7 $windle                   2002     93     NA    5.3   200
##  8 '15'                      2002     25     NA    6.7    24
##  9 '38                       1987     97     NA    6.6    18
## 10 '49-'17                   1917     61     NA    6      51
## # ℹ 58,778 more rows
## A-2 計算出每年電影預算之平均值,並且畫出。
average_budget <- movies %>%
  group_by(year) %>%
  summarise(avg_budget = mean(budget, na.rm = T))
ggplot(average_budget, aes(x = year, y = avg_budget)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 2) +
  labs(title = "每年電影預算的平均值",
       x = "年份",
       y = "平均預算 (美元)") +
  theme_classic()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_point()`).

## A-3 找出1990年後 不同mpaa分級之戲劇片以及非劇情片的平均評價,並用ggplot畫出
##B -1. 找出 每輛飛機的 飛行次數,平均飛行距離,平均delay時間
ggplot(aa1, aes(x = dist , y = delay)) +
  geom_point(aes(size = count), 
             alpha = 0.8, 
             color = "blue") +
  labs(title = "飛行距離與延遲時間的關係",
       x = "飛行距離",
       y = "延遲時間(分鐘)",
       size = "飛行次數") +
  theme_classic() 

## B - 2. 根據 B-1結果篩選出飛行次數>20,平均飛行距離<2000飛機,並依此畫出 飛行距離 和 delay時間關聯, 再找出好的呈現方式

cc1 =  flights %>%
  group_by(tailnum) %>%
  summarise( count = n(),                          
             dist = mean(distance, na.rm = T),
             delay = mean(arr_delay, na.rm = T)) %>%
  filter(count > 20, dist < 2000)

ggplot(cc1, aes(x = dist , y = delay)) +
  geom_point(aes(size = count), 
             alpha = 0.8, 
             color = "pink") +
  labs(title = "飛行距離與延遲時間的關係",
       x = "飛行距離",
       y = "延遲時間(分鐘)",
       size = "飛行次數") +
  theme_classic() 
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).