maybe:
https://stackoverflow.com/questions/36230790/missing-horizontal-scroll-bar-in-r-markdown-html-code-chunks-and-output
css: report_styles.css
pre, code {white-space:pre !important; overflow-x:scroll !important}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Package ‘Lahman’: https://cran.r-project.org/web/packages/Lahman/Lahman.pdf
#install.packages("Lahman")
library(Lahman)
# install.packages("vcdExtra")
# vcdExtra::datasets(package="Lahman")
# data(AllstarFull) -- etc
name1 <- names(AllstarFull)
name2 <- names(Appearances)
name3 <- names(AwardsManagers)
name4 <- names(AwardsPlayers)
name5 <- names(AwardsShareManagers)
name6 <- names(AwardsSharePlayers)
name7 <- names(Batting)
name8 <- names(BattingPost)
name9 <- names(CollegePlaying)
name10 <- names(Fielding)
name11 <- names(FieldingOF)
name12 <- names(FieldingPost)
name13 <- names(HallOfFame)
name14 <- names(LahmanData)
name15 <- names(Managers)
name16 <- names(ManagersHalf)
name17 <- names(Master)
name18 <- names(Pitching)
name19 <- names(PitchingPost)
name20 <- names(Salaries)
name21 <- names(Schools)
name22 <- names(SeriesPost)
name23 <- names(Teams)
name24 <- names(TeamsFranchises)
name25 <- names(TeamsHalf)
name26 <- names(battingLabels)
name27 <- names(fieldingLabels)
name28 <- names(pitchingLabels)
library(tibble)
lahmanNames <- list(
AllstarFull = tibble(var = name1),
Appearances = tibble(var = name2),
AwardsManagers = tibble(var = name3),
AwardsPlayers = tibble(var = name4),
AwardsShareManagers = tibble(var = name5),
AwardsSharePlayers = tibble(var = name6),
Batting = tibble(var = name7),
BattingPost = tibble(var = name8),
CollegePlaying = tibble(var = name9),
Fielding = tibble(var = name10),
FieldingOF = tibble(var = name11),
FieldingPost = tibble(var = name12),
HallOfFame = tibble(var = name13),
LahmanData = tibble(var = name14),
Managers = tibble(var = name15),
ManagersHalf = tibble(var = name16),
Master = tibble(var = name17),
Pitching = tibble(var = name18),
PitchingPost = tibble(var = name19),
Salaries = tibble(var = name20),
Schools = tibble(var = name21),
SeriesPost = tibble(var = name22),
Teams = tibble(var = name23),
TeamsFranchises = tibble(var = name24),
TeamsHalf = tibble(var = name25),
battingLabels = tibble(var = name26),
fieldingLabels = tibble(var = name27),
pitchingLabels = tibble(var = name28)
)
str(lahmanNames)
## List of 28
## $ AllstarFull :Classes 'tbl_df', 'tbl' and 'data.frame': 8 obs. of 1 variable:
## ..$ var: chr [1:8] "playerID" "yearID" "gameNum" "gameID" ...
## $ Appearances :Classes 'tbl_df', 'tbl' and 'data.frame': 21 obs. of 1 variable:
## ..$ var: chr [1:21] "yearID" "teamID" "lgID" "playerID" ...
## $ AwardsManagers :Classes 'tbl_df', 'tbl' and 'data.frame': 6 obs. of 1 variable:
## ..$ var: chr [1:6] "playerID" "awardID" "yearID" "lgID" ...
## $ AwardsPlayers :Classes 'tbl_df', 'tbl' and 'data.frame': 6 obs. of 1 variable:
## ..$ var: chr [1:6] "playerID" "awardID" "yearID" "lgID" ...
## $ AwardsShareManagers:Classes 'tbl_df', 'tbl' and 'data.frame': 7 obs. of 1 variable:
## ..$ var: chr [1:7] "awardID" "yearID" "lgID" "playerID" ...
## $ AwardsSharePlayers :Classes 'tbl_df', 'tbl' and 'data.frame': 7 obs. of 1 variable:
## ..$ var: chr [1:7] "awardID" "yearID" "lgID" "playerID" ...
## $ Batting :Classes 'tbl_df', 'tbl' and 'data.frame': 22 obs. of 1 variable:
## ..$ var: chr [1:22] "playerID" "yearID" "stint" "teamID" ...
## $ BattingPost :Classes 'tbl_df', 'tbl' and 'data.frame': 22 obs. of 1 variable:
## ..$ var: chr [1:22] "yearID" "round" "playerID" "teamID" ...
## $ CollegePlaying :Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of 1 variable:
## ..$ var: chr [1:3] "playerID" "schoolID" "yearID"
## $ Fielding :Classes 'tbl_df', 'tbl' and 'data.frame': 18 obs. of 1 variable:
## ..$ var: chr [1:18] "playerID" "yearID" "stint" "teamID" ...
## $ FieldingOF :Classes 'tbl_df', 'tbl' and 'data.frame': 6 obs. of 1 variable:
## ..$ var: chr [1:6] "playerID" "yearID" "stint" "Glf" ...
## $ FieldingPost :Classes 'tbl_df', 'tbl' and 'data.frame': 17 obs. of 1 variable:
## ..$ var: chr [1:17] "playerID" "yearID" "teamID" "lgID" ...
## $ HallOfFame :Classes 'tbl_df', 'tbl' and 'data.frame': 9 obs. of 1 variable:
## ..$ var: chr [1:9] "playerID" "yearID" "votedBy" "ballots" ...
## $ LahmanData :Classes 'tbl_df', 'tbl' and 'data.frame': 5 obs. of 1 variable:
## ..$ var: chr [1:5] "file" "class" "nobs" "nvar" ...
## $ Managers :Classes 'tbl_df', 'tbl' and 'data.frame': 10 obs. of 1 variable:
## ..$ var: chr [1:10] "playerID" "yearID" "teamID" "lgID" ...
## $ ManagersHalf :Classes 'tbl_df', 'tbl' and 'data.frame': 10 obs. of 1 variable:
## ..$ var: chr [1:10] "playerID" "yearID" "teamID" "lgID" ...
## $ Master :Classes 'tbl_df', 'tbl' and 'data.frame': 26 obs. of 1 variable:
## ..$ var: chr [1:26] "playerID" "birthYear" "birthMonth" "birthDay" ...
## $ Pitching :Classes 'tbl_df', 'tbl' and 'data.frame': 30 obs. of 1 variable:
## ..$ var: chr [1:30] "playerID" "yearID" "stint" "teamID" ...
## $ PitchingPost :Classes 'tbl_df', 'tbl' and 'data.frame': 30 obs. of 1 variable:
## ..$ var: chr [1:30] "playerID" "yearID" "round" "teamID" ...
## $ Salaries :Classes 'tbl_df', 'tbl' and 'data.frame': 5 obs. of 1 variable:
## ..$ var: chr [1:5] "yearID" "teamID" "lgID" "playerID" ...
## $ Schools :Classes 'tbl_df', 'tbl' and 'data.frame': 5 obs. of 1 variable:
## ..$ var: chr [1:5] "schoolID" "name_full" "city" "state" ...
## $ SeriesPost :Classes 'tbl_df', 'tbl' and 'data.frame': 9 obs. of 1 variable:
## ..$ var: chr [1:9] "yearID" "round" "teamIDwinner" "lgIDwinner" ...
## $ Teams :Classes 'tbl_df', 'tbl' and 'data.frame': 48 obs. of 1 variable:
## ..$ var: chr [1:48] "yearID" "lgID" "teamID" "franchID" ...
## $ TeamsFranchises :Classes 'tbl_df', 'tbl' and 'data.frame': 4 obs. of 1 variable:
## ..$ var: chr [1:4] "franchID" "franchName" "active" "NAassoc"
## $ TeamsHalf :Classes 'tbl_df', 'tbl' and 'data.frame': 10 obs. of 1 variable:
## ..$ var: chr [1:10] "yearID" "lgID" "teamID" "Half" ...
## $ battingLabels :Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 1 variable:
## ..$ var: chr [1:2] "variable" "label"
## $ fieldingLabels :Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 1 variable:
## ..$ var: chr [1:2] "variable" "label"
## $ pitchingLabels :Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 1 variable:
## ..$ var: chr [1:2] "variable" "label"
list of 26 data frames, each of which contains the variable names of one dataset in the Lahman package.
lahmanNames
## $AllstarFull
## # A tibble: 8 x 1
## var
## <chr>
## 1 playerID
## 2 yearID
## 3 gameNum
## 4 gameID
## 5 teamID
## 6 lgID
## 7 GP
## 8 startingPos
##
## $Appearances
## # A tibble: 21 x 1
## var
## <chr>
## 1 yearID
## 2 teamID
## 3 lgID
## 4 playerID
## 5 G_all
## 6 GS
## 7 G_batting
## 8 G_defense
## 9 G_p
## 10 G_c
## # ... with 11 more rows
##
## $AwardsManagers
## # A tibble: 6 x 1
## var
## <chr>
## 1 playerID
## 2 awardID
## 3 yearID
## 4 lgID
## 5 tie
## 6 notes
##
## $AwardsPlayers
## # A tibble: 6 x 1
## var
## <chr>
## 1 playerID
## 2 awardID
## 3 yearID
## 4 lgID
## 5 tie
## 6 notes
##
## $AwardsShareManagers
## # A tibble: 7 x 1
## var
## <chr>
## 1 awardID
## 2 yearID
## 3 lgID
## 4 playerID
## 5 pointsWon
## 6 pointsMax
## 7 votesFirst
##
## $AwardsSharePlayers
## # A tibble: 7 x 1
## var
## <chr>
## 1 awardID
## 2 yearID
## 3 lgID
## 4 playerID
## 5 pointsWon
## 6 pointsMax
## 7 votesFirst
##
## $Batting
## # A tibble: 22 x 1
## var
## <chr>
## 1 playerID
## 2 yearID
## 3 stint
## 4 teamID
## 5 lgID
## 6 G
## 7 AB
## 8 R
## 9 H
## 10 X2B
## # ... with 12 more rows
##
## $BattingPost
## # A tibble: 22 x 1
## var
## <chr>
## 1 yearID
## 2 round
## 3 playerID
## 4 teamID
## 5 lgID
## 6 G
## 7 AB
## 8 R
## 9 H
## 10 X2B
## # ... with 12 more rows
##
## $CollegePlaying
## # A tibble: 3 x 1
## var
## <chr>
## 1 playerID
## 2 schoolID
## 3 yearID
##
## $Fielding
## # A tibble: 18 x 1
## var
## <chr>
## 1 playerID
## 2 yearID
## 3 stint
## 4 teamID
## 5 lgID
## 6 POS
## 7 G
## 8 GS
## 9 InnOuts
## 10 PO
## 11 A
## 12 E
## 13 DP
## 14 PB
## 15 WP
## 16 SB
## 17 CS
## 18 ZR
##
## $FieldingOF
## # A tibble: 6 x 1
## var
## <chr>
## 1 playerID
## 2 yearID
## 3 stint
## 4 Glf
## 5 Gcf
## 6 Grf
##
## $FieldingPost
## # A tibble: 17 x 1
## var
## <chr>
## 1 playerID
## 2 yearID
## 3 teamID
## 4 lgID
## 5 round
## 6 POS
## 7 G
## 8 GS
## 9 InnOuts
## 10 PO
## 11 A
## 12 E
## 13 DP
## 14 TP
## 15 PB
## 16 SB
## 17 CS
##
## $HallOfFame
## # A tibble: 9 x 1
## var
## <chr>
## 1 playerID
## 2 yearID
## 3 votedBy
## 4 ballots
## 5 needed
## 6 votes
## 7 inducted
## 8 category
## 9 needed_note
##
## $LahmanData
## # A tibble: 5 x 1
## var
## <chr>
## 1 file
## 2 class
## 3 nobs
## 4 nvar
## 5 title
##
## $Managers
## # A tibble: 10 x 1
## var
## <chr>
## 1 playerID
## 2 yearID
## 3 teamID
## 4 lgID
## 5 inseason
## 6 G
## 7 W
## 8 L
## 9 rank
## 10 plyrMgr
##
## $ManagersHalf
## # A tibble: 10 x 1
## var
## <chr>
## 1 playerID
## 2 yearID
## 3 teamID
## 4 lgID
## 5 inseason
## 6 half
## 7 G
## 8 W
## 9 L
## 10 rank
##
## $Master
## # A tibble: 26 x 1
## var
## <chr>
## 1 playerID
## 2 birthYear
## 3 birthMonth
## 4 birthDay
## 5 birthCountry
## 6 birthState
## 7 birthCity
## 8 deathYear
## 9 deathMonth
## 10 deathDay
## # ... with 16 more rows
##
## $Pitching
## # A tibble: 30 x 1
## var
## <chr>
## 1 playerID
## 2 yearID
## 3 stint
## 4 teamID
## 5 lgID
## 6 W
## 7 L
## 8 G
## 9 GS
## 10 CG
## # ... with 20 more rows
##
## $PitchingPost
## # A tibble: 30 x 1
## var
## <chr>
## 1 playerID
## 2 yearID
## 3 round
## 4 teamID
## 5 lgID
## 6 W
## 7 L
## 8 G
## 9 GS
## 10 CG
## # ... with 20 more rows
##
## $Salaries
## # A tibble: 5 x 1
## var
## <chr>
## 1 yearID
## 2 teamID
## 3 lgID
## 4 playerID
## 5 salary
##
## $Schools
## # A tibble: 5 x 1
## var
## <chr>
## 1 schoolID
## 2 name_full
## 3 city
## 4 state
## 5 country
##
## $SeriesPost
## # A tibble: 9 x 1
## var
## <chr>
## 1 yearID
## 2 round
## 3 teamIDwinner
## 4 lgIDwinner
## 5 teamIDloser
## 6 lgIDloser
## 7 wins
## 8 losses
## 9 ties
##
## $Teams
## # A tibble: 48 x 1
## var
## <chr>
## 1 yearID
## 2 lgID
## 3 teamID
## 4 franchID
## 5 divID
## 6 Rank
## 7 G
## 8 Ghome
## 9 W
## 10 L
## # ... with 38 more rows
##
## $TeamsFranchises
## # A tibble: 4 x 1
## var
## <chr>
## 1 franchID
## 2 franchName
## 3 active
## 4 NAassoc
##
## $TeamsHalf
## # A tibble: 10 x 1
## var
## <chr>
## 1 yearID
## 2 lgID
## 3 teamID
## 4 Half
## 5 divID
## 6 DivWin
## 7 Rank
## 8 G
## 9 W
## 10 L
##
## $battingLabels
## # A tibble: 2 x 1
## var
## <chr>
## 1 variable
## 2 label
##
## $fieldingLabels
## # A tibble: 2 x 1
## var
## <chr>
## 1 variable
## 2 label
##
## $pitchingLabels
## # A tibble: 2 x 1
## var
## <chr>
## 1 variable
## 2 label
Use purrr’s reduce() and a single set operation to return a data frame that contains every variable that appears in all 26 datasets (if any does).
library(purrr)
# Find variables in common
# ref intersect() set operation
reduce(lahmanNames, intersect) # returns an error
# output seen in course:
## A tibble: 0 × 1
## ... with 1 variables: var <chr>
No variables span all 26 datasets!
reduce(list(lahmanNames$AllstarFull,lahmanNames$Appearances), intersect)
intersect(lahmanNames$AllstarFull,lahmanNames$Appearances)
as.vector(lahmanNames$AllstarFull)
intersect(as.vector(lahmanNames$AllstarFull),as.vector(lahmanNames$Appearances))
as.character(lahmanNames$AllstarFull)
## [1] "c(\"playerID\", \"yearID\", \"gameNum\", \"gameID\", \"teamID\", \"lgID\", \"GP\", \"startingPos\")"
KH1 <- as.character(lahmanNames$AllstarFull)
KH2 <- as.character(lahmanNames$Appearances)
unlist(KH2, use.names = FALSE)
## [1] "c(\"yearID\", \"teamID\", \"lgID\", \"playerID\", \"G_all\", \"GS\", \"G_batting\", \"G_defense\", \"G_p\", \"G_c\", \"G_1b\", \"G_2b\", \"G_3b\", \"G_ss\", \"G_lf\", \"G_cf\", \"G_rf\", \"G_of\", \"G_dh\", \"G_ph\", \"G_pr\")"
intersect(KH1, KH2)
## character(0)
intersect(c(1,2,3), c(3,4,5))
## [1] 3
intersect(as.vector(lahmanNames$AllstarFull),as.vector(lahmanNames$Appearances))
intersect(as.vector(lahmanNames$AllstarFull[1:2,1]),as.vector(lahmanNames$AllstarFull[5:6]))
## Error: Column indexes must be at most 1 if positive, not 5, 6
as.numeric(as.matrix(lahmanNames$AllstarFull))
## Warning: NAs introduced by coercion
## [1] NA NA NA NA NA NA NA NA
intersect(as.matrix(lahmanNames$AllstarFull),as.matrix(lahmanNames$Appearances))
## [1] "playerID" "yearID" "teamID" "lgID"
reduce(as.matrix(unlist(lahmanNames)), intersect)
## character(0)
These variables provide relationships between the various tables in Lahman
lahmanNames %>%
# Bind the data frames in lahmanNames
# into a single table
# that includes a new column
# called dataframe,
# which contains the names of the data frames.
bind_rows(.id = 'dataframe')
lahmanNames %>%
# Bind the data frames in lahmanNames
bind_rows(.id = 'dataframe') %>%
# into a single table
# that includes a new column
# called dataframe,
# which contains the names of the data frames.
# Group the result by var
group_by(var)
lahmanNames %>%
# Bind the data frames in lahmanNames
bind_rows(.id = 'dataframe') %>%
# into a single table
# that includes a new column
# called dataframe,
# which contains the names of the data frames.
# Group the result by var
group_by(var) %>%
# Tally the number of appearances
tally()
lahmanNames %>%
# Bind the data frames in lahmanNames
bind_rows(.id = 'dataframe') %>%
# into a single table
# that includes a new column
# called dataframe,
# which contains the names of the data frames.
# Group the result by var
group_by(var) %>%
# Tally the number of appearances
tally() %>%
# Filter the data
# to only variables that appear more than once
filter(n > 1)
lahmanNames %>%
# Bind the data frames in lahmanNames
bind_rows(.id = 'dataframe') %>%
# into a single table
# that includes a new column
# called dataframe,
# which contains the names of the data frames.
# Group the result by var
group_by(var) %>%
# Tally the number of appearances
tally() %>%
# Filter the data
filter(n > 1) %>%
# to only variables that appear more than once
# Arrange the results
# most frequently used variables appear before less frequently used variables.
arrange(desc(n))
most widely used keys
lahmanNames %>%
# Bind the data frames
# into a single table
# that includes the column dataframe,
# which contains the names of the data frames.
bind_rows(.id = 'dataframe')
lahmanNames %>%
# Bind the data frames
# into a single table
# that includes the column dataframe,
# which contains the names of the data frames.
bind_rows(.id = 'dataframe') %>%
# Filter the results
filter(var == "playerID")
$lahmanNames %>%
# Bind the data frames
# into a single table
# that includes the column dataframe,
# which contains the names of the data frames.
bind_rows(.id = 'dataframe') %>%
# Filter the results
filter(var == "playerID") %>%
# Extract the dataframe variable
`$`("dataframe")
## [1] "AllstarFull" "Appearances" "AwardsManagers"
## [4] "AwardsPlayers" "AwardsShareManagers" "AwardsSharePlayers"
## [7] "Batting" "BattingPost" "CollegePlaying"
## [10] "Fielding" "FieldingOF" "FieldingPost"
## [13] "HallOfFame" "Managers" "ManagersHalf"
## [16] "Master" "Pitching" "PitchingPost"
## [19] "Salaries"
verify salary information for all players i.e. no systematic holes in our coverage
=> need: list of every player (to compare against salaries)
distinct() – find unique rows for all columns in a table.
players <- Master %>% # may contain multiple rows
# for each player
# Return one row for each distinct player
# & three columns:
# playerID
# nameFirst
# nameLast
distinct(playerID, nameFirst, nameLast)
# players contains a concise list of player IDs and names
head(players)
players %>%
# Find all players who do not appear in Salaries
anti_join(Salaries)
## Joining, by = "playerID"
players %>%
# Find all players who do not appear in Salaries
anti_join(Salaries) %>%
# how many players do not appear in Salaries
# dplyr function count()
count()
## Joining, by = "playerID"
# i.e. we are missing the salaries for 13,888 players
Explain the hole in the salary data!
Is it possible that these players somehow did not play (and hence did not earn a salary)?
Appearances contains information about every game played in major league baseball. (i.e. if a player played a game, it would show up as a row in Appearances)
players %>%
anti_join(Salaries, by = "playerID")
players %>%
anti_join(Salaries, by = "playerID") %>%
# Add a join to the pipe
# to determine how many (if any) unsalaried players
# played at least one game.
#
# How many unsalaried players appear in Appearances?
semi_join(Appearances, by = "playerID")
players %>%
anti_join(Salaries, by = "playerID") %>%
# Add a join to the pipe
# to determine how many (if any) unsalaried players
# played at least one game.
#
# How many unsalaried players appear in Appearances?
semi_join(Appearances, by = "playerID") %>%
count()
# 13,695 players played a game but are missing salary information.
# Interestingly, 193 players neither played a game nor have a recorded salary.
how many games each of these unsalaried players played?
players %>%
# Find all players who do not appear in Salaries
anti_join(Salaries, by = "playerID")
players %>%
# Find all players who do not appear in Salaries
anti_join(Salaries, by = "playerID") %>%
# Join them to Appearances
left_join(Appearances, by = "playerID")
players %>%
# Find all players who do not appear in Salaries
anti_join(Salaries, by = "playerID") %>%
# Join them to Appearances
left_join(Appearances, by = "playerID") %>%
# Calculate total_games for each player
group_by(playerID)
players %>%
# Find all players who do not appear in Salaries
anti_join(Salaries, by = "playerID") %>%
# Join them to Appearances
left_join(Appearances, by = "playerID") %>%
# Calculate total_games for each player
group_by(playerID) %>%
summarize(total_games = sum(G_all, na.rm = TRUE))
players %>%
# Find all players who do not appear in Salaries
anti_join(Salaries, by = "playerID") %>%
# Join them to Appearances
left_join(Appearances, by = "playerID") %>%
# Calculate total_games for each player
group_by(playerID) %>%
summarize(total_games = sum(G_all, na.rm = TRUE)) %>%
# Arrange in descending order by total_games
arrange(desc(total_games))
Is it possible that the unsalaried players did not actually play in the games that they appeared in?
players %>%
# Find unsalaried players
anti_join(Salaries, by = "playerID") %>%
# Join Batting to the unsalaried players
left_join(Batting, by = "playerID") %>%
# Group by player
group_by(playerID) %>%
# Sum at-bats for each player
summarize(total_at_bat = sum(AB, na.rm = TRUE)) %>%
# Arrange in descending order
arrange(desc(total_at_bat))
# The unpaid players definitely participated in the games
# Ty Cobb and Hank Aaron in the top three at-bats
# i.e. dealing with missing data here and not unsalaried players
collection of distinguished baseball players selected by a committee of baseball experts
names(HallOfFame)
## [1] "playerID" "yearID" "votedBy" "ballots" "needed"
## [6] "votes" "inducted" "category" "needed_note"
head(HallOfFame)
str(HallOfFame)
## 'data.frame': 4156 obs. of 9 variables:
## $ playerID : chr "cobbty01" "ruthba01" "wagneho01" "mathech01" ...
## $ yearID : int 1936 1936 1936 1936 1936 1936 1936 1936 1936 1936 ...
## $ votedBy : chr "BBWAA" "BBWAA" "BBWAA" "BBWAA" ...
## $ ballots : int 226 226 226 226 226 226 226 226 226 226 ...
## $ needed : int 170 170 170 170 170 170 170 170 170 170 ...
## $ votes : int 222 215 215 205 189 146 133 111 105 80 ...
## $ inducted : Factor w/ 2 levels "N","Y": 2 2 2 2 2 1 1 1 1 1 ...
## $ category : Factor w/ 4 levels "Manager","Pioneer/Executive",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ needed_note: chr NA NA NA NA ...
# Find the distinct players (by playerID) that appear in HallOfFame
nominated <- HallOfFame %>%
distinct(playerID)
head(nominated)
nominated %>%
# Count the number of players in nominated
count()
# there were 1,239 nominees for the Hall of Fame.
nominated_full <- nominated %>%
# Join to Master
left_join(Master, by = "playerID")
nominated_full <- nominated %>%
# Join to Master
left_join(Master, by = "playerID") %>%
# Return playerID, nameFirst, nameLast
select(playerID, nameFirst, nameLast)
head(nominated_full)
# We now have a dataset of everyone nominated!
# Repeat the process in the last exercise
inducted <- HallOfFame %>%
filter(inducted == "Y") %>%
distinct(playerID)
# call filter() before distinct()
# to get the distinct players
# that were inducted into the hall of fame
head(inducted)
inducted %>%
# Count the number of players in inducted
count()
# 312 players have been inducted into the Hall of Fame out of 1,239 nominees
inducted_full <- inducted %>%
# Join to Master
left_join(Master, by = "playerID") %>%
# Return playerID, nameFirst, nameLast
select(playerID, nameFirst, nameLast)
head(inducted_full)
# We now also have datasets of everyone inducted and everyone nominated
Did nominees who were inducted earn more awards than nominees who were not inducted?
head(AwardsPlayers)
AwardsPlayers
# Tally the number of awards in AwardsPlayers by playerID
nAwards <- AwardsPlayers %>%
group_by(playerID) %>%
tally()
# resulting column containing the number of awards is called n by default
head(nAwards)
nAwards
nAwards %>%
# Filter to just the players in inducted
semi_join(inducted, by = "playerID")
nAwards %>%
# Filter to just the players in inducted
semi_join(inducted, by = "playerID") %>%
# Calculate the mean number of awards per player
summarize(avg_n = mean(n, na.rm = TRUE))
nAwards %>%
# Filter to just the players in nominated
semi_join(nominated, by = "playerID") %>%
# Filter to players NOT in inducted
anti_join(inducted, by = "playerID")
nAwards %>%
# Filter to just the players in nominated
semi_join(nominated, by = "playerID") %>%
# Filter to players NOT in inducted
anti_join(inducted, by = "playerID") %>%
# Calculate the mean number of awards per player
summarize(avg_n = mean(n, na.rm = TRUE))
# On Average, inductees had 11.95 - 4.23 = 7.72 more awards than non-inductees
i.e. another way to differentiate inductees from non-inductees
Does the maximum salary earned by inductees tend to be greater than the maximum salary earned by nominees who were not inducted?
# Find the players who are in nominated, but not inducted
notInducted <- nominated %>%
setdiff(inducted) # using a set operation
head(notInducted)
# Filter Salaries against notInducted to return only the salaries of nominees who were not inducted
Salaries %>%
# Find the players who are in notInducted
semi_join(notInducted, by = "playerID")
Salaries %>%
# Find the players who are in notInducted
semi_join(notInducted, by = "playerID") %>%
# Calculate the max salary by player
group_by(playerID) %>%
summarize(max_salary = max(salary, na.rm = TRUE)) %>%
# Calculate the average of the max salaries
summarize(avg_salary = mean(max_salary, na.rm = TRUE))
# Repeat for players who were inducted
Salaries %>%
semi_join(inducted, by = "playerID")
# Repeat for players who were inducted
Salaries %>%
semi_join(inducted, by = "playerID") %>%
group_by(playerID) %>%
summarize(max_salary = max(salary, na.rm = TRUE)) %>%
summarize(avg_salary = mean(max_salary, na.rm = TRUE))
# average salary of players who were inducted was $5,079,720 - $4,677,737 = $401,983 more per year
# We know that we have some missing salary information,
# but what else could be wrong with this analysis?
One of the rules of the Hall of Fame is that players cannot be nominated until five years after they retire
Is this reflected in our data?
head(Appearances)
Appearances %>%
# Filter Appearances against nominated
semi_join(nominated, by = "playerID")
Appearances %>%
# Filter Appearances against nominated
semi_join(nominated, by = "playerID") %>%
# Find last year played by player - before retiring
group_by(playerID) %>%
summarize(last_year = max(yearID))
Appearances %>%
# Filter Appearances against nominated
semi_join(nominated, by = "playerID") %>%
# Find last year played by player
group_by(playerID) %>%
summarize(last_year = max(yearID)) %>%
# Join to full HallOfFame
left_join(HallOfFame, by = "playerID")
Appearances %>%
# Filter Appearances against nominated
semi_join(nominated, by = "playerID") %>%
# Find last year played by player
group_by(playerID) %>%
summarize(last_year = max(yearID)) %>%
# Join to full HallOfFame
left_join(HallOfFame, by = "playerID") %>%
# Filter for unusual observations
# players were nominated before they completely retired
filter(last_year >= yearID) # yearID variable from HallOfFame
# quite a few players have been nominated before they retired
# but this practice seems much less frequent in recent years.