Assignment 2 (Part 1)

Using the Lahman baseball database (library(Lahman)), get the number of distinct players in the HallOfFame data set.

library(Lahman)

# Keep the first occurrence of every playerID in HallOfFame, then count them.
distinct_players <- HallOfFame$playerID[!duplicated(HallOfFame$playerID)]
number_distinct_players <- length(distinct_players)
number_distinct_players
## [1] 1344

Find distinct players (category == “Player”) in HallOfFame with inducted == “Y”

# Rows of HallOfFame for the "Player" category that were inducted. which()
# drops rows where the condition evaluates to NA, as subset() does.
inducted_players <- HallOfFame[
  which(HallOfFame$category == "Player" & HallOfFame$inducted == "Y"),
]
# One entry per inducted playerID, then the count of those IDs.
unique_inducted_players <- inducted_players$playerID[
  !duplicated(inducted_players$playerID)
]
number_inducted_players <- length(unique_inducted_players)
number_inducted_players
## [1] 270

Create a HOFplayers data frame for players (category == “Player”) voted by BBWAA

# "Player"-category rows of HallOfFame whose votedBy is "BBWAA". which()
# drops rows where the condition evaluates to NA, as subset() does.
HOF_players <- HallOfFame[
  which(HallOfFame$category == "Player" & HallOfFame$votedBy == "BBWAA"),
]
class(HOF_players)
## [1] "data.frame"

Remove all NAs from your HOFplayers$votes column. Bonus: Also remove all zeros from your HOFplayers$votes column.

# Drop every row whose votes value is missing or zero, then show the last rows.
HOF_players <- HOF_players[
  !(is.na(HOF_players$votes) | HOF_players$votes == 0),
]
tail(HOF_players)
##       playerID yearID votedBy ballots needed votes inducted category
## 4310 howarry01   2022   BBWAA     394    296     8        N   Player
## 4311 teixema01   2022   BBWAA     394    296     6        N   Player
## 4312 papeljo01   2022   BBWAA     394    296     5        N   Player
## 4313 morneju01   2022   BBWAA     394    296     5        N   Player
## 4314 pierzaj01   2022   BBWAA     394    296     2        N   Player
## 4315 fieldpr01   2022   BBWAA     394    296     2        N   Player
##      needed_note
## 4310        <NA>
## 4311        <NA>
## 4312        <NA>
## 4313        <NA>
## 4314        <NA>
## 4315        <NA>

Assignment 2 (Part 2)

Create a function which returns “Elected” if inducted == “Y” and “Not elected” otherwise

#' Translate induction flags into election labels
#'
#' Vectorized: accepts a single flag or a whole column (character or factor)
#' and returns one label per element. A scalar `if` would stop with an error
#' on `NA` and on any input longer than one element, so the labels are built
#' by indexing instead.
#'
#' @param inducted Character or factor vector of induction flags.
#' @return A character vector the same length as `inducted`: "Elected" where
#'   the flag equals "Y", `NA` where the flag is missing, and "Not elected"
#'   everywhere else.
election_status <- function(inducted) {
  status <- rep("Not elected", length(inducted))
  # which() ignores NA comparisons, so missing flags are never marked elected.
  status[which(inducted == "Y")] <- "Elected"
  # A missing flag carries no information either way; propagate it as NA.
  status[is.na(inducted)] <- NA_character_
  status
}

# Spot-check election_status() on the inducted value of HallOfFame's 10th row.
test_election_status <- HallOfFame[10, "inducted"]
print(election_status(test_election_status))
## [1] "Not elected"

Create “elected” column in your HOFplayers by applying your function

# Label every row by calling election_status() on its inducted value.
# vapply() requires exactly one character string per element, so the new
# column is always a character vector; sapply() would return an empty list
# for zero rows and silently change type if the function misbehaved.
HOF_players$elected <- vapply(
  HOF_players$inducted, election_status, character(1)
)
head(HOF_players)
##    playerID yearID votedBy ballots needed votes inducted category needed_note
## 1  cobbty01   1936   BBWAA     226    170   222        Y   Player        <NA>
## 2  ruthba01   1936   BBWAA     226    170   215        Y   Player        <NA>
## 3 wagneho01   1936   BBWAA     226    170   215        Y   Player        <NA>
## 4 mathech01   1936   BBWAA     226    170   205        Y   Player        <NA>
## 5 johnswa01   1936   BBWAA     226    170   189        Y   Player        <NA>
## 6 lajoina01   1936   BBWAA     226    170   146        N   Player        <NA>
##       elected
## 1     Elected
## 2     Elected
## 3     Elected
## 4     Elected
## 5     Elected
## 6 Not elected

Add a pct = 100 * round(votes/ballots, 2) variable to the HOFplayers data frame, calculated by player

Hint - group_by(playerID) %>%

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Add pct: the votes/ballots ratio rounded to 2 decimals, times 100.
# The result stays grouped by playerID, as in the piped form.
HOF_players <- mutate(
  group_by(HOF_players, playerID),
  pct = 100 * round(votes / ballots, 2)
)
head(HOF_players)
## # A tibble: 6 × 11
## # Groups:   playerID [6]
##   playerID  yearID votedBy ballots needed votes inducted category needed_note
##   <chr>      <int> <chr>     <dbl>  <dbl> <dbl> <fct>    <fct>    <chr>      
## 1 cobbty01    1936 BBWAA       226    170   222 Y        Player   <NA>       
## 2 ruthba01    1936 BBWAA       226    170   215 Y        Player   <NA>       
## 3 wagneho01   1936 BBWAA       226    170   215 Y        Player   <NA>       
## 4 mathech01   1936 BBWAA       226    170   205 Y        Player   <NA>       
## 5 johnswa01   1936 BBWAA       226    170   189 Y        Player   <NA>       
## 6 lajoina01   1936 BBWAA       226    170   146 N        Player   <NA>       
## # ℹ 2 more variables: elected <chr>, pct <dbl>

Plot histogram of pct for elected == “Elected” only

# Keep only the rows labelled "Elected".
elected_data <- HOF_players[HOF_players[["elected"]] == "Elected", ]

# Base-graphics histogram of pct for those rows.
hist(
  x = elected_data$pct,
  col = "lightblue",
  border = "black",
  main = "Histogram of pct for Elected Players",
  xlab = "Percentage of Votes",
  ylab = "Frequency"
)

Bonus: Make your histogram nicer, use ggplot

library(ggplot2)

# ggplot2 histogram of pct for the elected rows, using bins 2 units wide.
ggplot(data = elected_data, mapping = aes(x = pct)) +
  geom_histogram(
    binwidth = 2,
    colour = "black",
    fill = "blue",
    alpha = 0.7
  ) +
  labs(
    x = "Percentage of Votes",
    y = "Number of Players",
    title = "Histogram of pct for Elected Players"
  ) +
  theme_minimal()