Combine Batting4 and Pitching2.
# Keep batting rows with at least one plate appearance and attach the
# pitching columns of every matching Pitching2 row. An inner join keeps
# exactly the batting rows that have a pitching match, which is what the
# earlier semi_join() + left_join() pair produced in two steps.
# NOTE(review): the join key is `playerID` only; the repeated rows in the
# printed result suggest each batting row is duplicated once per matching
# pitching row -- confirm this fan-out is intended.
BatPitch1 <- Batting4 %>%
  filter(PA > 0) %>%
  inner_join(Pitching2, by = "playerID")
BatPitch1
## # A tibble: 438,708 x 83
## playerID yearID.x stint.x teamID.x lgID.x G.x AB R.x H.x X2B X3B
## <chr> <int> <int> <fct> <fct> <int> <int> <int> <int> <int> <int>
## 1 allisdo~ 1871 1 WS3 NA 318 1407 236 382 44 10
## 2 ansonca~ 1871 1 RC1 NA 2524 10281 1999 3435 582 142
## 3 ansonca~ 1871 1 RC1 NA 2524 10281 1999 3435 582 142
## 4 barnero~ 1871 1 BS1 NA 499 2391 698 860 146 47
## 5 battijo~ 1871 1 CL1 NA 480 1953 228 439 51 25
## 6 battijo~ 1871 1 CL1 NA 480 1953 228 439 51 25
## 7 bechtge~ 1871 1 PH1 NA 221 1040 216 288 48 12
## 8 bechtge~ 1871 1 PH1 NA 221 1040 216 288 48 12
## 9 bechtge~ 1871 1 PH1 NA 221 1040 216 288 48 12
## 10 bechtge~ 1871 1 PH1 NA 221 1040 216 288 48 12
## # ... with 438,698 more rows, and 72 more variables: HR.x <int>, RBI <int>,
## # SB <int>, CS <int>, BB.x <int>, SO.x <int>, IBB.x <int>, HBP.x <int>,
## # SH.x <int>, SF.x <int>, GIDP.x <int>, BA <dbl>, PA <dbl>, TB <dbl>,
## # SlugPct <dbl>, OBP <dbl>, OPS <dbl>, BABIP <dbl>, birthYear <int>,
## # birthMonth <int>, birthDay <int>, birthCountry <chr>, birthState <chr>,
## # birthCity <chr>, deathYear <int>, deathMonth <int>, deathDay <int>,
## # deathCountry <chr>, deathState <chr>, deathCity <chr>, nameFirst <chr>,
## # nameLast <chr>, nameGiven <chr>, weight <int>, height <int>, bats <fct>,
## # throws <fct>, debut <chr>, finalGame <chr>, retroID <chr>, bbrefID <chr>,
## # deathDate <date>, birthDate <date>, yearID.y <int>, stint.y <int>,
## # teamID.y <fct>, lgID.y <fct>, W <int>, L <int>, G.y <int>, GS <int>,
## # CG <int>, SHO <int>, SV <int>, IPouts <int>, H.y <int>, ER <int>,
## # HR.y <int>, BB.y <int>, SO.y <int>, BAOpp <dbl>, ERA <dbl>, IBB.y <int>,
## # WP <int>, HBP.y <int>, BK <int>, BFP <int>, GF <int>, R.y <int>,
## # SH.y <int>, SF.y <int>, GIDP.y <int>
This table has a problem: some variables are shared by both data frames, so the join creates duplicated variables (suffixed `.x` and `.y`). So, let’s drop those shared variables from `Pitching2` first, then join the tables.
# Define a function that finds the names of the variables of `x`, from
# column `from` onward, that are also variables of `y`.
#
# x:    data frame whose column names are scanned.
# y:    data frame (or any object supported by `variable.names()`) to
#       compare against.
# from: index of the first column of `x` to consider (default 1); pass 2 to
#       skip a leading key column.
#
# Returns a character vector of the shared names, in the column order of `x`.
# The result is `character(0)` when nothing is shared or when `from` is past
# the last column of `x`.
dupVar <- function(x, y, from = 1L) {
  stopifnot(is.numeric(from), length(from) == 1L, !is.na(from))
  x_names <- names(x)
  # Filtering positions instead of iterating over `from:ncol(x)` avoids the
  # backwards sequence (e.g. 2:1) that R builds when `from` exceeds the
  # number of columns, which used to index a non-existent column.
  candidates <- x_names[seq_along(x_names) >= from]
  candidates[candidates %in% variable.names(y)]
}
# Drop from Pitching2 every variable that Batting4 also has. The scan starts
# at column 2, so the first variable, `playerID`, is kept as the join key.
PitchRM1 <- select(Pitching2, -c(dupVar(Pitching2, Batting4, 2)))
# Join: keep batting rows with at least one plate appearance and attach the
# remaining pitching columns of every matching PitchRM1 row. An inner join
# keeps exactly the batting rows that have a pitching match, which is what
# the earlier semi_join() + left_join() pair produced in two steps.
# NOTE(review): the join key is `playerID` only; the repeated rows in the
# printed result suggest each batting row is duplicated once per matching
# pitching row -- confirm this fan-out is intended.
BatPitch1 <- Batting4 %>%
  filter(PA > 0) %>%
  inner_join(PitchRM1, by = "playerID")
BatPitch1
## # A tibble: 438,708 x 68
## playerID yearID stint teamID lgID G AB R H X2B X3B HR
## <chr> <int> <int> <fct> <fct> <int> <int> <int> <int> <int> <int> <int>
## 1 allisdo~ 1871 1 WS3 NA 318 1407 236 382 44 10 2
## 2 ansonca~ 1871 1 RC1 NA 2524 10281 1999 3435 582 142 97
## 3 ansonca~ 1871 1 RC1 NA 2524 10281 1999 3435 582 142 97
## 4 barnero~ 1871 1 BS1 NA 499 2391 698 860 146 47 6
## 5 battijo~ 1871 1 CL1 NA 480 1953 228 439 51 25 3
## 6 battijo~ 1871 1 CL1 NA 480 1953 228 439 51 25 3
## 7 bechtge~ 1871 1 PH1 NA 221 1040 216 288 48 12 3
## 8 bechtge~ 1871 1 PH1 NA 221 1040 216 288 48 12 3
## 9 bechtge~ 1871 1 PH1 NA 221 1040 216 288 48 12 3
## 10 bechtge~ 1871 1 PH1 NA 221 1040 216 288 48 12 3
## # ... with 438,698 more rows, and 56 more variables: RBI <int>, SB <int>,
## # CS <int>, BB <int>, SO <int>, IBB <int>, HBP <int>, SH <int>, SF <int>,
## # GIDP <int>, BA <dbl>, PA <dbl>, TB <dbl>, SlugPct <dbl>, OBP <dbl>,
## # OPS <dbl>, BABIP <dbl>, birthYear <int>, birthMonth <int>, birthDay <int>,
## # birthCountry <chr>, birthState <chr>, birthCity <chr>, deathYear <int>,
## # deathMonth <int>, deathDay <int>, deathCountry <chr>, deathState <chr>,
## # deathCity <chr>, nameFirst <chr>, nameLast <chr>, nameGiven <chr>,
## # weight <int>, height <int>, bats <fct>, throws <fct>, debut <chr>,
## # finalGame <chr>, retroID <chr>, bbrefID <chr>, deathDate <date>,
## # birthDate <date>, W <int>, L <int>, GS <int>, CG <int>, SHO <int>,
## # SV <int>, IPouts <int>, ER <int>, BAOpp <dbl>, ERA <dbl>, WP <int>,
## # BK <int>, BFP <int>, GF <int>