library(Lahman)
Warning: package 'Lahman' was built under R version 3.4.3
library(dplyr)

1. 請計算每位選手的生涯打擊資料。哪位選手被三振(SO)的次數最多?被三振了幾次?在出場數(G)低於1000 場的選手中,哪位選手的全壘打數(HR)最多?共打了幾支全壘打?

# Career batting totals per player: add up strikeouts (SO), games played (G)
# and home runs (HR) over every Batting row that shares a playerID, then
# order the players from most to fewest career strikeouts.
dta_1 <- Batting %>%
  as_tibble() %>%
  group_by(playerID) %>%
  summarise(SO = sum(SO), G = sum(G), HR = sum(HR)) %>%
  arrange(desc(SO))
dta_1
# A tibble: 18,915 x 4
    playerID    SO     G    HR
       <chr> <int> <int> <int>
 1 jacksre01  2597  2820   563
 2 thomeji01  2548  2543   612
 3  dunnad01  2379  2001   462
 4  sosasa01  2306  2354   609
 5 rodrial01  2287  2784   696
 6 galaran01  2003  2257   399
 7 cansejo01  1942  1887   462
 8 stargwi01  1936  2360   475
 9 camermi01  1901  1955   278
10 schmimi01  1883  2404   548
# ... with 18,905 more rows

jacksre01被三振的次數最多,共有2597次。

# Keep only players with fewer than 1000 career games and rank them by
# career home runs, highest first.
dta_1 %>%
  filter(G < 1000) %>%
  arrange(desc(HR))
# A tibble: 17,350 x 4
    playerID    SO     G    HR
       <chr> <int> <int> <int>
 1 stantmi03   977   827   208
 2 balbost01   856   960   181
 3 gentiji01   663   936   179
 4 trumbma01   856   849   178
 5 kittlro01   744   843   176
 6 hidalri01   737   987   171
 7 troutmi01   784   811   168
 8 conigto01   629   876   166
 9 rodrihe02   803   950   160
10 quentca01   516   834   154
# ... with 17,340 more rows
# Print the full career table again; the filter above was not assigned back,
# so dta_1 is still the complete table ordered by strikeouts.
dta_1
# A tibble: 18,915 x 4
    playerID    SO     G    HR
       <chr> <int> <int> <int>
 1 jacksre01  2597  2820   563
 2 thomeji01  2548  2543   612
 3  dunnad01  2379  2001   462
 4  sosasa01  2306  2354   609
 5 rodrial01  2287  2784   696
 6 galaran01  2003  2257   399
 7 cansejo01  1942  1887   462
 8 stargwi01  1936  2360   475
 9 camermi01  1901  1955   278
10 schmimi01  1883  2404   548
# ... with 18,905 more rows

出場數(G)低於1000場的選手中,stantmi03的全壘打數(HR)最多,共有208支。

2. 哪位選手其第一次出場比賽(debut)與最後一次出場比賽(finalGame)的間隔時間最久?共隔了幾天?

# Career span per player: number of days between the first game (debut) and
# the last game (finalGame), listed from the longest span to the shortest.
# No group_by() is used: the date subtraction is already computed row by row,
# and grouping by playerID only produced one single-row group per player and
# left the result grouped.
dta_2 <- Master %>%
  as_tibble() %>%
  select(playerID, debut, finalGame) %>%
  mutate(period = as.Date(finalGame) - as.Date(debut)) %>%
  arrange(desc(period))
dta_2
# A tibble: 19,105 x 4
# Groups:   playerID [19,105]
    playerID      debut  finalGame     period
       <chr>      <chr>      <chr>     <time>
 1 altroni01 1898-07-14 1933-10-01 12862 days
 2 orourji01 1872-04-26 1904-09-22 11836 days
 3 minosmi01 1949-04-19 1980-10-05 11492 days
 4 olearch01 1904-04-14 1934-09-30 11126 days
 5 lathaar01 1880-07-05 1909-09-30 10678 days
 6 mcguide01 1884-06-21 1912-05-18 10192 days
 7 jennihu01 1891-06-01 1918-09-02  9954 days
 8 eversjo01 1902-09-01 1929-10-06  9897 days
 9  ryanno01 1966-09-11 1993-09-22  9873 days
10 streega01 1904-09-13 1931-09-20  9868 days
# ... with 19,095 more rows

選手altroni01第一次出場比賽(debut)與最後一次出場比賽(finalGame)間隔時間最久,共隔了12862天。

3. 請計算2015 年每個球隊的平均薪資,以及2014 年每個球隊的打擊資料(加總),並將兩組資料根據teamID 合併。

# Mean 2015 salary per team (m_salary), plus the same figure in thousands
# (msalary_k). teamID is stored as character because it is the key used for
# the join with the batting totals further down.
dta_s <- Salaries %>%
  as_tibble() %>%
  filter(yearID == 2015) %>%
  group_by(teamID) %>%
  summarise(m_salary = mean(salary)) %>%
  mutate(
    msalary_k = m_salary / 1000,
    teamID = as.character(teamID)
  )
str(dta_s)
Classes 'tbl_df', 'tbl' and 'data.frame':   30 obs. of  3 variables:
 $ teamID   : chr  "ARI" "ATL" "BAL" "BOS" ...
 $ m_salary : num  2132207 2990885 4108744 5659481 4161989 ...
 $ msalary_k: num  2132 2991 4109 5659 4162 ...
# 2014 batting totals per team: sum every counting statistic over all rows
# of a team in that season.
# yearID is part of the grouping key so that it is carried through unchanged
# (2014) instead of being summed together with the statistics, which would
# produce a meaningless number.
dta_b <- Batting %>%
  as_tibble() %>%
  filter(yearID == 2014) %>%
  select(-playerID, -lgID, -stint) %>%
  group_by(teamID, yearID) %>%
  summarise_all(sum) %>%
  ungroup() %>%
  # character key, matching the type of teamID in the salary summary
  mutate(teamID = as.character(teamID))
str(dta_b)
Classes 'tbl_df', 'tbl' and 'data.frame':   30 obs. of  19 variables:
 $ teamID: chr  "ARI" "ATL" "BAL" "BOS" ...
 $ yearID: int  104728 78546 88616 110770 88616 96672 90630 92644 98686 96672 ...
 $ G     : int  2274 2252 2294 2283 2285 2358 2213 2391 2387 2330 ...
 $ AB    : int  5552 5468 5596 5551 5543 5508 5395 5575 5612 5630 ...
 $ R     : int  615 573 705 634 660 614 595 669 755 757 ...
 $ H     : int  1379 1316 1434 1355 1400 1315 1282 1411 1551 1557 ...
 $ X2B   : int  259 240 264 282 279 270 254 284 307 325 ...
 $ X3B   : int  47 22 16 20 32 31 20 23 41 26 ...
 $ HR    : int  118 123 211 123 155 157 131 142 186 155 ...
 $ RBI   : int  573 545 681 601 625 590 562 644 721 731 ...
 $ SB    : int  86 95 44 63 85 65 122 104 85 106 ...
 $ CS    : int  33 33 20 25 36 40 52 27 48 41 ...
 $ BB    : int  398 472 401 535 417 442 415 504 397 443 ...
 $ SO    : int  1165 1369 1285 1337 1362 1477 1252 1189 1281 1144 ...
 $ IBB   : int  31 31 29 36 33 29 22 24 39 51 ...
 $ HBP   : int  43 43 62 68 60 54 52 42 48 44 ...
 $ SH    : int  56 53 35 20 19 57 76 51 59 24 ...
 $ SF    : int  36 27 36 52 38 41 37 49 48 61 ...
 $ GIDP  : int  115 121 112 138 127 94 88 126 121 137 ...
# Merge the 2015 salary summary with the 2014 batting totals on teamID;
# an inner join keeps only the teams that appear in both tables.
dta_3 <- dta_s %>%
  inner_join(dta_b, by = "teamID")
dta_3
# A tibble: 30 x 21
   teamID m_salary msalary_k yearID     G    AB     R     H   X2B   X3B
    <chr>    <dbl>     <dbl>  <int> <int> <int> <int> <int> <int> <int>
 1    ARI  2132207  2132.207 104728  2274  5552   615  1379   259    47
 2    ATL  2990885  2990.885  78546  2252  5468   573  1316   240    22
 3    BAL  4108744  4108.744  88616  2294  5596   705  1434   264    16
 4    BOS  5659481  5659.481 110770  2283  5551   634  1355   282    20
 5    CHA  4161989  4161.989  88616  2285  5543   660  1400   279    32
 6    CHN  4138547  4138.547  96672  2358  5508   614  1315   270    31
 7    CIN  4187862  4187.862  90630  2213  5395   595  1282   254    20
 8    CLE  3022888  3022.888  92644  2391  5575   669  1411   284    23
 9    COL  3827544  3827.544  98686  2387  5612   755  1551   307    41
10    DET  6891390  6891.390  96672  2330  5630   757  1557   325    26
# ... with 20 more rows, and 11 more variables: HR <int>, RBI <int>,
#   SB <int>, CS <int>, BB <int>, SO <int>, IBB <int>, HBP <int>,
#   SH <int>, SF <int>, GIDP <int>

4. 根據3 所得到的資料,使用ggpairs 繪製各球隊平均薪資與安打數(H)、全壘打數(HR)、打點數(RBI)、盜壘數(SB)、三振次數(SO)總數的散布圖。

library(GGally)
Warning: package 'GGally' was built under R version 3.4.3

Attaching package: 'GGally'
The following object is masked from 'package:dplyr':

    nasa
# Pairwise plot matrix of the team totals for hits, home runs, RBI, stolen
# bases and strikeouts together with the mean salary in thousands; the
# lower-triangle panels use the "smooth" option for continuous variables.
ggpairs(
  dta_3,
  columns = c("H", "HR", "RBI", "SB", "SO", "msalary_k"),
  lower = list(continuous = "smooth")
)