1.

Primary keys for Master, Batting, Pitching, and Fielding.

nrow(Master)
## [1] 20093
count(count(Master, playerID))[[1]]
## [1] 20093

Primary key for Master: playerID

nrow(Batting)
## [1] 108789
group_by(Batting, playerID, yearID) %>% 
  nrow()
## [1] 108789

Primary key for Batting: playerID and yearID together.

nrow(Pitching)
## [1] 48399
group_by(Pitching, playerID, yearID) %>% 
  nrow()
## [1] 48399

Primary key for Pitching: playerID, and yearID together.

nrow(Fielding)
## [1] 144768
group_by(Fielding, playerID, yearID) %>% 
  nrow()
## [1] 144768

Primary key for Fielding: playerID, and yearID together.

2.

Cacreer statistics for each player with Batting.

Batting2 <- Batting %>%
  group_by(playerID) %>% 
  mutate(G = sum(G, na.rm = TRUE), AB = sum(AB, na.rm = TRUE), 
         R = sum(R, na.rm = TRUE), H = sum(H, na.rm = TRUE), 
         X2B = sum(X2B, na.rm = TRUE), X3B = sum(X3B, na.rm = TRUE),
         HR = sum(HR, na.rm = TRUE), RBI = sum(RBI, na.rm = TRUE), 
         SB = sum(SB, na.rm = TRUE), CS = sum(CS, na.rm = TRUE), 
         BB = sum(BB, na.rm = TRUE), SO = sum(SO, na.rm = TRUE), 
         IBB = sum(IBB, na.rm = TRUE), HBP = sum(HBP, na.rm = TRUE), 
         SH = sum(SH, na.rm = TRUE), SF = sum(SF, na.rm = TRUE),
         GIDP = sum(GIDP, na.rm = TRUE))
as_tibble(Batting2)
## # A tibble: 108,789 x 22
##    playerID yearID stint teamID lgID      G    AB     R     H   X2B   X3B    HR
##    <chr>     <int> <int> <fct>  <fct> <int> <int> <int> <int> <int> <int> <int>
##  1 abercda~   1871     1 TRO    NA        1     4     0     0     0     0     0
##  2 addybo01   1871     1 RC1    NA      274  1231   227   341    36    13     1
##  3 allisar~   1871     1 CL1    NA      168   741   106   188    19     8     1
##  4 allisdo~   1871     1 WS3    NA      318  1407   236   382    44    10     2
##  5 ansonca~   1871     1 RC1    NA     2524 10281  1999  3435   582   142    97
##  6 armstbo~   1871     1 FW1    NA       12    49     9    11     2     1     0
##  7 barkeal~   1871     1 RC1    NA        1     4     0     1     0     0     0
##  8 barnero~   1871     1 BS1    NA      499  2391   698   860   146    47     6
##  9 barrebi~   1871     1 FW1    NA        3    13     1     2     1     0     0
## 10 barrofr~   1871     1 BS1    NA       18    86    13    13     2     1     0
## # ... with 108,779 more rows, and 10 more variables: RBI <int>, SB <int>,
## #   CS <int>, BB <int>, SO <int>, IBB <int>, HBP <int>, SH <int>, SF <int>,
## #   GIDP <int>

Use mutate.

It was possible to make a function with loops to get sums for multiple variables, however since the time that it takes to run all the loops, it becomes inefficient (takes about 3 minutes). Below is the function that runs the loop.

mutatesum1 <- function(df,keyIndex, i, j) {
  #Arrange the dataframe with key first.
  #keyIndex: position of the key in column
  #i: position of the variable to start
  #j: position of the variable to finish
  varsum <- matrix(nrow = nrow(df), ncol = j-i+1)
  for (obs in 1:nrow(df)) {
    for (vars in i:j) {
      if (is.na(df[obs, vars])) {
        varsum[obs, vars-i+1] <- 0
      } 
      if (obs == 1) {
        varsum[obs, vars-i+1] <- df[obs, vars]
      } else if (df[obs, keyIndex] == df[obs-1, keyIndex]) {
        varsum[obs, vars-i+1] <- df[obs, vars] + varsum[obs-1, vars-i+1]
        l <- 1
        while (df[obs, keyIndex] == df[obs-l, keyIndex]) {
          varsum[obs-l, vars-i+1] <- varsum[obs, vars-i+1]
          l <- l+1
          if (obs-l == 0){
            break
          }
        }
      } else {
        varsum[obs, vars-i+1] <- df[obs, vars]
      }
    }
  }
  df[, i:j] <- varsum
}

3.

Do (2) with Pitching.

Pitching2 <- Pitching %>%
  group_by(playerID) %>% 
  mutate(W = sum(W, na.rm = TRUE), L = sum(L, na.rm = TRUE), 
         G = sum(G, na.rm = TRUE), GS = sum(GS, na.rm = TRUE), 
         CG = sum(CG, na.rm = TRUE), SHO = sum(SHO, na.rm = TRUE),
         SV = sum(SV, na.rm = TRUE), IPouts = sum(IPouts, na.rm = TRUE), 
         H = sum(H, na.rm = TRUE), ER = sum(ER, na.rm = TRUE), 
         HR = sum(HR, na.rm = TRUE), BB = sum(BB, na.rm = TRUE), 
         SO = sum(SO, na.rm = TRUE), IBB = sum(IBB, na.rm = TRUE),
         WP = sum(WP, na.rm = TRUE), HBP = sum(HBP, na.rm = TRUE),
         BK = sum(BK, na.rm = TRUE), BFP = sum(BFP, na.rm = TRUE),
         GF = sum(GF, na.rm = TRUE), R = sum(R, na.rm = TRUE),
         SH = sum(SH, na.rm = TRUE), SF = sum(SF, na.rm = TRUE),
         GIDP = sum(GIDP, na.rm = TRUE))
as_tibble(Pitching2)
## # A tibble: 48,399 x 30
##    playerID yearID stint teamID lgID      W     L     G    GS    CG   SHO    SV
##    <chr>     <int> <int> <fct>  <fct> <int> <int> <int> <int> <int> <int> <int>
##  1 bechtge~   1871     1 PH1    NA        7    20    30    27    25     0     0
##  2 brainas~   1871     1 WS3    NA       24    53    85    82    77     0     0
##  3 fergubo~   1871     1 NY2    NA        1     3    11     4     4     0     0
##  4 fishech~   1871     1 RC1    NA       56    84   165   141   126     4     3
##  5 fleetfr~   1871     1 NY2    NA        2     6     9     8     7     0     0
##  6 flowedi~   1871     1 TRO    NA        0     0     1     0     0     0     0
##  7 mackde01   1871     1 RC1    NA        0     1     3     1     1     0     0
##  8 mathebo~   1871     1 FW1    NA      297   248   578   568   525    20     3
##  9 mcbridi~   1871     1 PH1    NA      149    78   237   237   227    10     0
## 10 mcmuljo~   1871     1 TRO    NA       14    15    37    31    30     0     1
## # ... with 48,389 more rows, and 18 more variables: IPouts <int>, H <int>,
## #   ER <int>, HR <int>, BB <int>, SO <int>, BAOpp <dbl>, ERA <dbl>, IBB <int>,
## #   WP <int>, HBP <int>, BK <int>, BFP <int>, GF <int>, R <int>, SH <int>,
## #   SF <int>, GIDP <int>

4.

Batting4 with statistics and players’ info.

Batting3 <- as_tibble(battingStats(Batting2))

#selects variables we need from `Master` data.
BattingInfo <- Master %>%
  select(playerID, starts_with("name"), bats, throws)

#join the data on the right side
Batting4 <- Batting3 %>% 
  left_join(Master, by = "playerID")

as_tibble(select(Batting4, playerID, starts_with("name"), bats, throws, everything()))
## # A tibble: 108,789 x 54
##    playerID nameFirst nameLast nameGiven bats  throws yearID stint teamID lgID 
##    <chr>    <chr>     <chr>    <chr>     <fct> <fct>   <int> <int> <fct>  <fct>
##  1 abercda~ Frank     Abercro~ Francis ~ <NA>  <NA>     1871     1 TRO    NA   
##  2 addybo01 Bob       Addy     Robert E~ L     L        1871     1 RC1    NA   
##  3 allisar~ Art       Allison  Arthur A~ <NA>  <NA>     1871     1 CL1    NA   
##  4 allisdo~ Doug      Allison  Douglas ~ R     R        1871     1 WS3    NA   
##  5 ansonca~ Cap       Anson    Adrian C~ R     R        1871     1 RC1    NA   
##  6 armstbo~ Robert    Armstro~ Robert L~ <NA>  <NA>     1871     1 FW1    NA   
##  7 barkeal~ Al        Barker   Alfred L. <NA>  <NA>     1871     1 RC1    NA   
##  8 barnero~ Ross      Barnes   Charles ~ R     R        1871     1 BS1    NA   
##  9 barrebi~ Bill      Barrett  William   <NA>  <NA>     1871     1 FW1    NA   
## 10 barrofr~ Frank     Barrows  Franklin~ <NA>  <NA>     1871     1 BS1    NA   
## # ... with 108,779 more rows, and 44 more variables: G <int>, AB <int>,
## #   R <int>, H <int>, X2B <int>, X3B <int>, HR <int>, RBI <int>, SB <int>,
## #   CS <int>, BB <int>, SO <int>, IBB <int>, HBP <int>, SH <int>, SF <int>,
## #   GIDP <int>, BA <dbl>, PA <dbl>, TB <dbl>, SlugPct <dbl>, OBP <dbl>,
## #   OPS <dbl>, BABIP <dbl>, birthYear <int>, birthMonth <int>, birthDay <int>,
## #   birthCountry <chr>, birthState <chr>, birthCity <chr>, deathYear <int>,
## #   deathMonth <int>, deathDay <int>, deathCountry <chr>, deathState <chr>,
## #   deathCity <chr>, weight <int>, height <int>, debut <chr>, finalGame <chr>,
## #   retroID <chr>, bbrefID <chr>, deathDate <date>, birthDate <date>

5.

Combine Batting4 and Pitching2.

BatPitch1 <- Batting4 %>% 
  filter(PA > 0) %>% 
  semi_join(Pitching2, by = "playerID") %>% 
  left_join(Pitching2, by = "playerID")
BatPitch1
## # A tibble: 438,708 x 83
##    playerID yearID.x stint.x teamID.x lgID.x   G.x    AB   R.x   H.x   X2B   X3B
##    <chr>       <int>   <int> <fct>    <fct>  <int> <int> <int> <int> <int> <int>
##  1 allisdo~     1871       1 WS3      NA       318  1407   236   382    44    10
##  2 ansonca~     1871       1 RC1      NA      2524 10281  1999  3435   582   142
##  3 ansonca~     1871       1 RC1      NA      2524 10281  1999  3435   582   142
##  4 barnero~     1871       1 BS1      NA       499  2391   698   860   146    47
##  5 battijo~     1871       1 CL1      NA       480  1953   228   439    51    25
##  6 battijo~     1871       1 CL1      NA       480  1953   228   439    51    25
##  7 bechtge~     1871       1 PH1      NA       221  1040   216   288    48    12
##  8 bechtge~     1871       1 PH1      NA       221  1040   216   288    48    12
##  9 bechtge~     1871       1 PH1      NA       221  1040   216   288    48    12
## 10 bechtge~     1871       1 PH1      NA       221  1040   216   288    48    12
## # ... with 438,698 more rows, and 72 more variables: HR.x <int>, RBI <int>,
## #   SB <int>, CS <int>, BB.x <int>, SO.x <int>, IBB.x <int>, HBP.x <int>,
## #   SH.x <int>, SF.x <int>, GIDP.x <int>, BA <dbl>, PA <dbl>, TB <dbl>,
## #   SlugPct <dbl>, OBP <dbl>, OPS <dbl>, BABIP <dbl>, birthYear <int>,
## #   birthMonth <int>, birthDay <int>, birthCountry <chr>, birthState <chr>,
## #   birthCity <chr>, deathYear <int>, deathMonth <int>, deathDay <int>,
## #   deathCountry <chr>, deathState <chr>, deathCity <chr>, nameFirst <chr>,
## #   nameLast <chr>, nameGiven <chr>, weight <int>, height <int>, bats <fct>,
## #   throws <fct>, debut <chr>, finalGame <chr>, retroID <chr>, bbrefID <chr>,
## #   deathDate <date>, birthDate <date>, yearID.y <int>, stint.y <int>,
## #   teamID.y <fct>, lgID.y <fct>, W <int>, L <int>, G.y <int>, GS <int>,
## #   CG <int>, SHO <int>, SV <int>, IPouts <int>, H.y <int>, ER <int>,
## #   HR.y <int>, BB.y <int>, SO.y <int>, BAOpp <dbl>, ERA <dbl>, IBB.y <int>,
## #   WP <int>, HBP.y <int>, BK <int>, BFP <int>, GF <int>, R.y <int>,
## #   SH.y <int>, SF.y <int>, GIDP.y <int>

This table has some problem. Some varibles are shared in both dataframe, so it creates duplicated varaibles. So, let’s filter out those variables, then join them.

# Defind a function that finds a vector of varaible names that are in both data frame.
dupVar <- function(x,y, from) {
  dupVar <- vector()
  count <- 1
  for (i in from:ncol(x)) {
    if (names(x[i]) %in% variable.names(y)){
      dupVar[count] <- names(x[i])
      count <- 1+count
    }
  }
  return(dupVar)
}

# Remove the variable (By start from 2, the first variable `playerID` is kept).
PitchRM1 <- Pitching2 %>% 
  select(-c(dupVar(Pitching2, Batting4,2)))

# Join
BatPitch1 <- Batting4 %>% 
  filter(PA > 0) %>% 
  semi_join(PitchRM1, by = "playerID") %>% 
  left_join(PitchRM1, by = "playerID")

BatPitch1
## # A tibble: 438,708 x 68
##    playerID yearID stint teamID lgID      G    AB     R     H   X2B   X3B    HR
##    <chr>     <int> <int> <fct>  <fct> <int> <int> <int> <int> <int> <int> <int>
##  1 allisdo~   1871     1 WS3    NA      318  1407   236   382    44    10     2
##  2 ansonca~   1871     1 RC1    NA     2524 10281  1999  3435   582   142    97
##  3 ansonca~   1871     1 RC1    NA     2524 10281  1999  3435   582   142    97
##  4 barnero~   1871     1 BS1    NA      499  2391   698   860   146    47     6
##  5 battijo~   1871     1 CL1    NA      480  1953   228   439    51    25     3
##  6 battijo~   1871     1 CL1    NA      480  1953   228   439    51    25     3
##  7 bechtge~   1871     1 PH1    NA      221  1040   216   288    48    12     3
##  8 bechtge~   1871     1 PH1    NA      221  1040   216   288    48    12     3
##  9 bechtge~   1871     1 PH1    NA      221  1040   216   288    48    12     3
## 10 bechtge~   1871     1 PH1    NA      221  1040   216   288    48    12     3
## # ... with 438,698 more rows, and 56 more variables: RBI <int>, SB <int>,
## #   CS <int>, BB <int>, SO <int>, IBB <int>, HBP <int>, SH <int>, SF <int>,
## #   GIDP <int>, BA <dbl>, PA <dbl>, TB <dbl>, SlugPct <dbl>, OBP <dbl>,
## #   OPS <dbl>, BABIP <dbl>, birthYear <int>, birthMonth <int>, birthDay <int>,
## #   birthCountry <chr>, birthState <chr>, birthCity <chr>, deathYear <int>,
## #   deathMonth <int>, deathDay <int>, deathCountry <chr>, deathState <chr>,
## #   deathCity <chr>, nameFirst <chr>, nameLast <chr>, nameGiven <chr>,
## #   weight <int>, height <int>, bats <fct>, throws <fct>, debut <chr>,
## #   finalGame <chr>, retroID <chr>, bbrefID <chr>, deathDate <date>,
## #   birthDate <date>, W <int>, L <int>, GS <int>, CG <int>, SHO <int>,
## #   SV <int>, IPouts <int>, ER <int>, BAOpp <dbl>, ERA <dbl>, WP <int>,
## #   BK <int>, BFP <int>, GF <int>

6.

Write a function HR_rate that calculates HR/G and returns the original table with sorted by new variable.

HR_rate <- function(df) {
  df2 <- df %>% 
    mutate(HR_G = HR / G) %>% 
    select(playerID:lgID, HR_G, everything()) %>% 
    arrange(desc(HR_G))
  return(df2)
}

as_tibble(HR_rate(Batting))
## # A tibble: 108,789 x 23
##    playerID yearID stint teamID lgID   HR_G     G    AB     R     H   X2B   X3B
##    <chr>     <int> <int> <fct>  <fct> <dbl> <int> <int> <int> <int> <int> <int>
##  1 lefebbi~   1938     1 BOS    AL    1         1     1     1     1     0     0
##  2 quirkja~   1984     2 CLE    AL    1         1     1     1     1     0     0
##  3 goffje01   1996     1 HOU    NL    1         1     4     1     2     0     0
##  4 marsolo~   2008     1 PHI    NL    1         1     4     2     2     0     0
##  5 hillst01   2010     1 SLN    NL    1         1     3     1     1     0     0
##  6 campulu~   2020     1 SDN    NL    1         1     3     2     1     0     0
##  7 kniceal~   1981     1 HOU    NL    0.667     3     7     2     4     0     0
##  8 mancitr~   2016     1 BAL    AL    0.6       5    14     3     5     1     0
##  9 clarkda~   1888     1 CHN    NL    0.5       2     7     4     2     0     1
## 10 reillch~   1889     1 CL6    AA    0.5       6    23     5    11     1     0
## # ... with 108,779 more rows, and 11 more variables: HR <int>, RBI <int>,
## #   SB <int>, CS <int>, BB <int>, SO <int>, IBB <int>, HBP <int>, SH <int>,
## #   SF <int>, GIDP <int>

In order to keep the Batting as it as, we will return a new table with the new variable, instead of overriding the original table, since it gets used later too.

7.

Create Batting5 by relevel lgID of Batting as AL, NL, and Other.

First, let’s check the levels of lgID.

levels(Batting$lgID)
## [1] "AA" "AL" "FL" "NA" "NL" "PL" "UA"

There are AA, FL, NA, PL, and UA that are not AL or NL. Therefore, let’s collapse them into the Other level.

Batting5 <- Batting %>%
  mutate(lgID = fct_collapse(lgID,
                             AL = "AL",
                             NL = "NL",
                             Other = c("AA", "FL", "NA", "PL", "UA")))
as_tibble(Batting5)
## # A tibble: 108,789 x 22
##    playerID yearID stint teamID lgID      G    AB     R     H   X2B   X3B    HR
##    <chr>     <int> <int> <fct>  <fct> <int> <int> <int> <int> <int> <int> <int>
##  1 abercda~   1871     1 TRO    Other     1     4     0     0     0     0     0
##  2 addybo01   1871     1 RC1    Other    25   118    30    32     6     0     0
##  3 allisar~   1871     1 CL1    Other    29   137    28    40     4     5     0
##  4 allisdo~   1871     1 WS3    Other    27   133    28    44    10     2     2
##  5 ansonca~   1871     1 RC1    Other    25   120    29    39    11     3     0
##  6 armstbo~   1871     1 FW1    Other    12    49     9    11     2     1     0
##  7 barkeal~   1871     1 RC1    Other     1     4     0     1     0     0     0
##  8 barnero~   1871     1 BS1    Other    31   157    66    63    10     9     0
##  9 barrebi~   1871     1 FW1    Other     1     5     1     1     1     0     0
## 10 barrofr~   1871     1 BS1    Other    18    86    13    13     2     1     0
## # ... with 108,779 more rows, and 10 more variables: RBI <int>, SB <int>,
## #   CS <int>, BB <int>, SO <int>, IBB <int>, HBP <int>, SH <int>, SF <int>,
## #   GIDP <int>

8.

Create integer variables debutYear, debutMonth, and debutDay from Master$debut. Then, create debutDate and debutAge into Master2.

Master2 <- Master %>% 
  mutate(debutYear = as.integer(str_sub(debut, 1, 4)), 
         debutMonth = as.integer(str_sub(debut, 6, 7)),
         debutDay = as.integer(str_sub(debut, 9, 10)),
         debutDate = make_date(debutYear, debutMonth, debutDay), 
         birthDate = ymd(birthDate),
         debutAge = as.period(interval(birthDate, debutDate), unit = "year")$year)

as_tibble(select(Master2, playerID, debutDate, debutAge, everything()))
## # A tibble: 20,093 x 31
##    playerID debutDate  debutAge birthYear birthMonth birthDay birthCountry
##    <chr>    <date>        <dbl>     <int>      <int>    <int> <chr>       
##  1 aardsda~ 2004-04-06       22      1981         12       27 USA         
##  2 aaronha~ 1954-04-13       20      1934          2        5 USA         
##  3 aaronto~ 1962-04-10       22      1939          8        5 USA         
##  4 aasedo01 1977-07-26       22      1954          9        8 USA         
##  5 abadan01 2001-09-10       29      1972          8       25 USA         
##  6 abadfe01 2010-07-28       24      1985         12       17 D.R.        
##  7 abadijo~ 1875-04-26       24      1850         11        4 USA         
##  8 abbated~ 1897-09-04       20      1877          4       15 USA         
##  9 abbeybe~ 1892-06-14       22      1869         11       11 USA         
## 10 abbeych~ 1893-08-16       26      1866         10       14 USA         
## # ... with 20,083 more rows, and 24 more variables: birthState <chr>,
## #   birthCity <chr>, deathYear <int>, deathMonth <int>, deathDay <int>,
## #   deathCountry <chr>, deathState <chr>, deathCity <chr>, nameFirst <chr>,
## #   nameLast <chr>, nameGiven <chr>, weight <int>, height <int>, bats <fct>,
## #   throws <fct>, debut <chr>, finalGame <chr>, retroID <chr>, bbrefID <chr>,
## #   deathDate <date>, birthDate <date>, debutYear <int>, debutMonth <int>,
## #   debutDay <int>

9.

Use Master$birthCountry to create a factor variable called birthContinent that gives the continent of each player’s birth.

Before creating a factor, let’s check the levels of the variable.

unique(Master2$birthCountry)
##  [1] "USA"            "D.R."           "Venezuela"      "Cuba"          
##  [5] "Mexico"         "Panama"         "CAN"            "P.R."          
##  [9] "Russia"         "Japan"          "Curacao"        "Colombia"      
## [13] "Nicaragua"      "Germany"        "Norway"         "Ireland"       
## [17] "Italy"          "Bahamas"        "United Kingdom" "South Korea"   
## [21] "Australia"      "Czech Republic" "V.I."           "Netherlands"   
## [25] "France"         "Aruba"          NA               "Sweden"        
## [29] "Hong Kong"      "Afghanistan"    "Spain"          "Greece"        
## [33] "Taiwan"         "Philippines"    "Jamaica"        "Poland"        
## [37] "Honduras"       "Brazil"         "Viet Nam"       "Guam"          
## [41] "Denmark"        "Switzerland"    "Austria"        "Singapore"     
## [45] "China"          "Belgium"        "Peru"           "Belize"        
## [49] "Indonesia"      "Finland"        "Lithuania"      "South Africa"  
## [53] "At Sea"         "Slovakia"       "American Samoa" "Saudi Arabia"  
## [57] "Portugal"       "Latvia"

Now, let’s convert the variable as a factor and define the levels.

Master2 <- Master2 %>% 
  mutate(birthCountry = factor(birthCountry, levels = unique(birthCountry))) %>% 
  mutate(birthContinent = fct_collapse(birthCountry, 
                                       `North America` = c("USA", "CAN"),
                                       `Latin America` = c("D.R.", "Venezuela",
                                                           "Cuba", "Mexico",
                                                           "Colombia", "Nicaragua", 
                                                           "Bahamas", "P.R.", "Curacao", 
                                                           "Aruba", "Jamaica", "Brazil", 
                                                           "Belize", "Honduras", 
                                                           "American Samoa", "V.I."),
                                       `Europe` = c("Germany", "Norway", "Czech Republic",
                                                    "Netherlands", "Ireland", "Italy",
                                                    "United Kingdom", "France", "Sweden", 
                                                    "Spain", "Greece", "Poland", "Denmark", 
                                                    "Switzerland", "Austria", "Belgium", 
                                                    "Finland", "Lithuania", "Latvia", 
                                                    "Slovakia", "Portugal"),
                                       `Asia` = c("Russia", "Japan", "South Korea", 
                                                  "Hong Kong", "Afghanistan", "Taiwan",
                                                  "Philippines", "Viet Nam", "China", 
                                                  "Indonesia", "Saudi Arabia"),
                                       `Oceania` = c("Australia", "Guam", "Singapore"),
                                       `Afrika` = c("South Africa"),
                                       `Other` = c("At Sea")))
as_tibble(select(Master2, playerID, birthCountry, birthContinent, debutAge, debutDate, everything()))
## # A tibble: 20,093 x 32
##    playerID birthCountry birthContinent debutAge debutDate  birthYear birthMonth
##    <chr>    <fct>        <fct>             <dbl> <date>         <int>      <int>
##  1 aardsda~ USA          North America        22 2004-04-06      1981         12
##  2 aaronha~ USA          North America        20 1954-04-13      1934          2
##  3 aaronto~ USA          North America        22 1962-04-10      1939          8
##  4 aasedo01 USA          North America        22 1977-07-26      1954          9
##  5 abadan01 USA          North America        29 2001-09-10      1972          8
##  6 abadfe01 D.R.         Latin America        24 2010-07-28      1985         12
##  7 abadijo~ USA          North America        24 1875-04-26      1850         11
##  8 abbated~ USA          North America        20 1897-09-04      1877          4
##  9 abbeybe~ USA          North America        22 1892-06-14      1869         11
## 10 abbeych~ USA          North America        26 1893-08-16      1866         10
## # ... with 20,083 more rows, and 25 more variables: birthDay <int>,
## #   birthState <chr>, birthCity <chr>, deathYear <int>, deathMonth <int>,
## #   deathDay <int>, deathCountry <chr>, deathState <chr>, deathCity <chr>,
## #   nameFirst <chr>, nameLast <chr>, nameGiven <chr>, weight <int>,
## #   height <int>, bats <fct>, throws <fct>, debut <chr>, finalGame <chr>,
## #   retroID <chr>, bbrefID <chr>, deathDate <date>, birthDate <date>,
## #   debutYear <int>, debutMonth <int>, debutDay <int>

10.

Master2 with checking difference in bats and throws.

SameDiff <- function(df) {
  Same_Dif <- character(nrow(df))
  for (i in 1:nrow(df)) {
    if (is.na(as.character(df$bats[i]) == as.character(df$throws[i]))){
      Same_Dif[i] <- NA
    } else if (as.character(df$bats[i]) == as.character(df$throws[i])) {
      Same_Dif[i] <-  "Same"
    } else {
      Same_Dif[i] <-  "Different"
    }
  }
  df2 <- df %>% 
    mutate(Same_Dif = Same_Dif)
  return(df2)
}

as_tibble(select(SameDiff(Master2), playerID, Same_Dif, bats, throws, birthCountry, birthContinent, debutAge, debutDate, everything()))
## # A tibble: 20,093 x 33
##    playerID Same_Dif bats  throws birthCountry birthContinent debutAge
##    <chr>    <chr>    <fct> <fct>  <fct>        <fct>             <dbl>
##  1 aardsda~ Same     R     R      USA          North America        22
##  2 aaronha~ Same     R     R      USA          North America        20
##  3 aaronto~ Same     R     R      USA          North America        22
##  4 aasedo01 Same     R     R      USA          North America        22
##  5 abadan01 Same     L     L      USA          North America        29
##  6 abadfe01 Same     L     L      D.R.         Latin America        24
##  7 abadijo~ Same     R     R      USA          North America        24
##  8 abbated~ Same     R     R      USA          North America        20
##  9 abbeybe~ Same     R     R      USA          North America        22
## 10 abbeych~ Same     L     L      USA          North America        26
## # ... with 20,083 more rows, and 26 more variables: debutDate <date>,
## #   birthYear <int>, birthMonth <int>, birthDay <int>, birthState <chr>,
## #   birthCity <chr>, deathYear <int>, deathMonth <int>, deathDay <int>,
## #   deathCountry <chr>, deathState <chr>, deathCity <chr>, nameFirst <chr>,
## #   nameLast <chr>, nameGiven <chr>, weight <int>, height <int>, debut <chr>,
## #   finalGame <chr>, retroID <chr>, bbrefID <chr>, deathDate <date>,
## #   birthDate <date>, debutYear <int>, debutMonth <int>, debutDay <int>

11.

Same as (10), but only with one of the map* functions.

SameDiffFnc <- function(x, y) {
  if (is.na(x==y)) {
    return(NA)
  } else if (x == y) {
    return("Same")
  } else {
    return("Different")
  }
}

SameDiffDf <- function(df) {
  Same_Dif_map <- map2_chr(as.character(df$bats), as.character(df$throws), SameDiffFnc)
  df2 <- df %>% 
    mutate(Same_Dif_map = Same_Dif_map)
  return(df2)
}

as_tibble(select(SameDiffDf(Master2), playerID, Same_Dif_map, bats, throws, birthCountry, birthContinent, debutAge, debutDate, everything()))
## # A tibble: 20,093 x 33
##    playerID Same_Dif_map bats  throws birthCountry birthContinent debutAge
##    <chr>    <chr>        <fct> <fct>  <fct>        <fct>             <dbl>
##  1 aardsda~ Same         R     R      USA          North America        22
##  2 aaronha~ Same         R     R      USA          North America        20
##  3 aaronto~ Same         R     R      USA          North America        22
##  4 aasedo01 Same         R     R      USA          North America        22
##  5 abadan01 Same         L     L      USA          North America        29
##  6 abadfe01 Same         L     L      D.R.         Latin America        24
##  7 abadijo~ Same         R     R      USA          North America        24
##  8 abbated~ Same         R     R      USA          North America        20
##  9 abbeybe~ Same         R     R      USA          North America        22
## 10 abbeych~ Same         L     L      USA          North America        26
## # ... with 20,083 more rows, and 26 more variables: debutDate <date>,
## #   birthYear <int>, birthMonth <int>, birthDay <int>, birthState <chr>,
## #   birthCity <chr>, deathYear <int>, deathMonth <int>, deathDay <int>,
## #   deathCountry <chr>, deathState <chr>, deathCity <chr>, nameFirst <chr>,
## #   nameLast <chr>, nameGiven <chr>, weight <int>, height <int>, debut <chr>,
## #   finalGame <chr>, retroID <chr>, bbrefID <chr>, deathDate <date>,
## #   birthDate <date>, debutYear <int>, debutMonth <int>, debutDay <int>