For this first assignment, I am creating a dataframe from a FiveThirtyEight (538) data repository and publishing the result to my RPubs account.
My objective is to isolate the playoff games from this dataset.
Load the data into a dataframe and display the internal structure of this dataframe object.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the FiveThirtyEight MLB Elo CSV directly from its URL into a tibble.
five38_mlb <- readr::read_csv(
  file = "https://projects.fivethirtyeight.com/mlb-api/mlb_elo.csv"
)
## Rows: 228326 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): playoff, team1, team2, pitcher1, pitcher2
## dbl (20): season, neutral, elo1_pre, elo2_pre, elo_prob1, elo_prob2, elo1_p...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the internal structure (column names, types, first values) of the data.
utils::str(five38_mlb)
## spc_tbl_ [228,326 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : Date[1:228326], format: "2023-10-01" "2023-10-01" ...
## $ season : num [1:228326] 2023 2023 2023 2023 2023 ...
## $ neutral : num [1:228326] 0 0 0 0 0 0 0 0 0 0 ...
## $ playoff : chr [1:228326] NA NA NA NA ...
## $ team1 : chr [1:228326] "STL" "SEA" "NYM" "MIL" ...
## $ team2 : chr [1:228326] "CIN" "TEX" "PHI" "CHC" ...
## $ elo1_pre : num [1:228326] 1500 1516 1506 1502 1423 ...
## $ elo2_pre : num [1:228326] 1485 1535 1523 1499 1542 ...
## $ elo_prob1 : num [1:228326] 0.555 0.507 0.51 0.539 0.367 ...
## $ elo_prob2 : num [1:228326] 0.445 0.493 0.49 0.461 0.633 ...
## $ elo1_post : num [1:228326] NA NA NA NA NA NA NA NA NA NA ...
## $ elo2_post : num [1:228326] NA NA NA NA NA NA NA NA NA NA ...
## $ rating1_pre : num [1:228326] 1501 1512 1525 1515 1421 ...
## $ rating2_pre : num [1:228326] 1473 1536 1524 1494 1555 ...
## $ pitcher1 : chr [1:228326] NA NA NA NA ...
## $ pitcher2 : chr [1:228326] NA NA NA NA ...
## $ pitcher1_rgs: num [1:228326] NA NA NA NA NA NA NA NA NA NA ...
## $ pitcher2_rgs: num [1:228326] NA NA NA NA NA NA NA NA NA NA ...
## $ pitcher1_adj: num [1:228326] NA NA NA NA NA NA NA NA NA NA ...
## $ pitcher2_adj: num [1:228326] NA NA NA NA NA NA NA NA NA NA ...
## $ rating_prob1: num [1:228326] 0.576 0.505 0.539 0.557 0.348 ...
## $ rating_prob2: num [1:228326] 0.424 0.495 0.461 0.443 0.652 ...
## $ rating1_post: num [1:228326] NA NA NA NA NA NA NA NA NA NA ...
## $ rating2_post: num [1:228326] NA NA NA NA NA NA NA NA NA NA ...
## $ score1 : num [1:228326] NA NA NA NA NA NA NA NA NA NA ...
## $ score2 : num [1:228326] NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_date(format = ""),
## .. season = col_double(),
## .. neutral = col_double(),
## .. playoff = col_character(),
## .. team1 = col_character(),
## .. team2 = col_character(),
## .. elo1_pre = col_double(),
## .. elo2_pre = col_double(),
## .. elo_prob1 = col_double(),
## .. elo_prob2 = col_double(),
## .. elo1_post = col_double(),
## .. elo2_post = col_double(),
## .. rating1_pre = col_double(),
## .. rating2_pre = col_double(),
## .. pitcher1 = col_character(),
## .. pitcher2 = col_character(),
## .. pitcher1_rgs = col_double(),
## .. pitcher2_rgs = col_double(),
## .. pitcher1_adj = col_double(),
## .. pitcher2_adj = col_double(),
## .. rating_prob1 = col_double(),
## .. rating_prob2 = col_double(),
## .. rating1_post = col_double(),
## .. rating2_post = col_double(),
## .. score1 = col_double(),
## .. score2 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Keep only the columns needed to describe a game and its final score.
five38_mlb_subset <- five38_mlb[
  c("date", "season", "playoff", "team1", "team2", "score1", "score2")
]

# Isolate the playoff games explicitly: a game counts as a playoff game when
# its `playoff` code is present. drop_na() then removes any remaining row with
# a missing value in the kept columns (e.g. a game without a recorded score).
# Together these keep exactly the complete rows, in the original order, without
# the large `na.action` attribute that na.omit() attaches to its result.
mlb_playoff_games <- five38_mlb_subset |>
  filter(!is.na(playoff)) |>
  drop_na()

# Inspect the structure of the playoff-only table.
str(mlb_playoff_games)
## tibble [1,747 × 7] (S3: tbl_df/tbl/data.frame)
## $ date : Date[1:1747], format: "2022-11-05" "2022-11-03" ...
## $ season : num [1:1747] 2022 2022 2022 2022 2022 ...
## $ playoff: chr [1:1747] "w" "w" "w" "w" ...
## $ team1 : chr [1:1747] "HOU" "PHI" "PHI" "PHI" ...
## $ team2 : chr [1:1747] "PHI" "HOU" "HOU" "HOU" ...
## $ score1 : num [1:1747] 4 2 0 7 5 5 5 4 10 0 ...
## $ score2 : num [1:1747] 1 3 5 0 2 6 6 3 6 5 ...
## - attr(*, "na.action")= 'omit' Named int [1:226579] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:226579] "1" "2" "3" "4" ...
# Give every column a descriptive name (new_name = old_name). Only the names
# change; the values and the column order stay as they were.
mlb_playoff_games <- rename(
  mlb_playoff_games,
  Date_of_game = date,
  Year_of_season = season,
  Playoff_Round = playoff,
  Home_Team = team1,
  Away_Team = team2,
  Home_Team_Score = score1,
  Away_Team_Score = score2
)
# Replace the single-letter playoff codes with readable round names and print
# the result. Codes that are not listed below are left unchanged by recode().
# NOTE(review): the recoded table is only displayed, not assigned, so
# `mlb_playoff_games` itself still holds the single-letter codes afterwards.
# Assign the result back if the labels should persist.
mlb_playoff_games |>
  mutate(
    Playoff_Round = recode(
      Playoff_Round,
      "w" = "World_Series",
      "l" = "League_Championship",
      "d" = "Division_Series",
      "c" = "Wild_Card"
    )
  )
## # A tibble: 1,747 × 7
## Date_of_game Year_of_season Playoff_Round Home_Team Away_Team Home_Team_Score
## <date> <dbl> <chr> <chr> <chr> <dbl>
## 1 2022-11-05 2022 World_Series HOU PHI 4
## 2 2022-11-03 2022 World_Series PHI HOU 2
## 3 2022-11-02 2022 World_Series PHI HOU 0
## 4 2022-11-01 2022 World_Series PHI HOU 7
## 5 2022-10-29 2022 World_Series HOU PHI 5
## 6 2022-10-28 2022 World_Series HOU PHI 5
## 7 2022-10-23 2022 Leage_Champi… NYY HOU 5
## 8 2022-10-23 2022 Leage_Champi… PHI SDP 4
## 9 2022-10-22 2022 Leage_Champi… PHI SDP 10
## 10 2022-10-22 2022 Leage_Champi… NYY HOU 0
## # ℹ 1,737 more rows
## # ℹ 1 more variable: Away_Team_Score <dbl>