Introduction

Retrosheet is a non-profit, all-volunteer organization with strong interests in baseball history. Founded in 1989, they have been computerizing play-by-play accounts of as many pre-1984 major league games as possible.

The main file that will be used in this project is a compilation of 127 separate play-by-play files from Retrosheet. It consists of hundreds of data points around each game. The goal is to normalize this data into several separate database tables and create a robust database of major league baseball statistics.

Main file: game_log.csv

Helper files: park_codes.csv, person_codes.csv, team_codes.csv

Data Exploration

Load the data using read_csv() from readr package of tidyverse.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
game_log <- read_csv("game_log.csv", col_types = cols(v_league = "c", h_league = "c",
                                                   `3b_umpire_id` = "c", `3b_umpire_name` = "c",
                                                   `2b_umpire_id` = "c", `2b_umpire_name` = "c",
                                                   `lf_umpire_id` = "c", `lf_umpire_name` = "c",
                                                   `rf_umpire_id` = "c", `rf_umpire_name` = "c",
                                                   completion = "c", winning_rbi_batter_id = "c",
                                                   winning_rbi_batter_id_name = "c", protest = "c",
                                                   v_first_catcher_interference = "c", 
                                                   h_first_catcher_interference = "c"))
park_codes <- read_csv("park_codes.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   park_id = col_character(),
##   name = col_character(),
##   aka = col_character(),
##   city = col_character(),
##   state = col_character(),
##   start = col_character(),
##   end = col_character(),
##   league = col_character(),
##   notes = col_character()
## )
person_codes <- read_csv("person_codes.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   id = col_character(),
##   last = col_character(),
##   first = col_character(),
##   player_debut = col_character(),
##   mgr_debut = col_character(),
##   coach_debut = col_character(),
##   ump_debut = col_character()
## )
team_codes <- read_csv("team_codes.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   team_id = col_character(),
##   league = col_character(),
##   start = col_double(),
##   end = col_double(),
##   city = col_character(),
##   nickname = col_character(),
##   franch_id = col_character(),
##   seq = col_double()
## )
dim(game_log)
## [1] 171907    161
head(game_log, 10)
## # A tibble: 10 x 161
##        date number_of_game day_of_week v_name v_league v_game_number h_name
##       <dbl>          <dbl> <chr>       <chr>  <chr>            <dbl> <chr> 
##  1 18710504              0 Thu         CL1    <NA>                 1 FW1   
##  2 18710505              0 Fri         BS1    <NA>                 1 WS3   
##  3 18710506              0 Sat         CL1    <NA>                 2 RC1   
##  4 18710508              0 Mon         CL1    <NA>                 3 CH1   
##  5 18710509              0 Tue         BS1    <NA>                 2 TRO   
##  6 18710511              0 Thu         CH1    <NA>                 2 CL1   
##  7 18710513              0 Sat         WS3    <NA>                 2 CL1   
##  8 18710513              0 Sat         CH1    <NA>                 3 FW1   
##  9 18710515              0 Mon         WS3    <NA>                 3 FW1   
## 10 18710516              0 Tue         TRO    <NA>                 2 BS1   
## # ... with 154 more variables: h_league <chr>, h_game_number <dbl>,
## #   v_score <dbl>, h_score <dbl>, length_outs <dbl>, day_night <chr>,
## #   completion <chr>, forfeit <chr>, protest <chr>, park_id <chr>,
## #   attendance <dbl>, length_minutes <dbl>, v_line_score <chr>,
## #   h_line_score <chr>, v_at_bats <dbl>, v_hits <dbl>, v_doubles <dbl>,
## #   v_triples <dbl>, v_homeruns <dbl>, v_rbi <dbl>, v_sacrifice_hits <dbl>,
## #   v_sacrifice_flies <dbl>, v_hit_by_pitch <dbl>, v_walks <dbl>,
## #   v_intentional_walks <dbl>, v_strikeouts <dbl>, v_stolen_bases <dbl>,
## #   v_caught_stealing <dbl>, v_grounded_into_double <dbl>,
## #   v_first_catcher_interference <chr>, v_left_on_base <dbl>,
## #   v_pitchers_used <dbl>, v_individual_earned_runs <dbl>,
## #   v_team_earned_runs <dbl>, v_wild_pitches <dbl>, v_balks <dbl>,
## #   v_putouts <dbl>, v_assists <dbl>, v_errors <dbl>, v_passed_balls <dbl>,
## #   v_double_plays <dbl>, v_triple_plays <dbl>, h_at_bats <dbl>, h_hits <dbl>,
## #   h_doubles <dbl>, h_triples <dbl>, h_homeruns <dbl>, h_rbi <dbl>,
## #   h_sacrifice_hits <dbl>, h_sacrifice_flies <dbl>, h_hit_by_pitch <dbl>,
## #   h_walks <dbl>, h_intentional_walks <dbl>, h_strikeouts <dbl>,
## #   h_stolen_bases <dbl>, h_caught_stealing <dbl>,
## #   h_grounded_into_double <dbl>, h_first_catcher_interference <chr>,
## #   h_left_on_base <dbl>, h_pitchers_used <dbl>,
## #   h_individual_earned_runs <dbl>, h_team_earned_runs <dbl>,
## #   h_wild_pitches <dbl>, h_balks <dbl>, h_putouts <dbl>, h_assists <dbl>,
## #   h_errors <dbl>, h_passed_balls <dbl>, h_double_plays <dbl>,
## #   h_triple_plays <dbl>, hp_umpire_id <chr>, hp_umpire_name <chr>,
## #   1b_umpire_id <chr>, 1b_umpire_name <chr>, 2b_umpire_id <chr>,
## #   2b_umpire_name <chr>, 3b_umpire_id <chr>, 3b_umpire_name <chr>,
## #   lf_umpire_id <chr>, lf_umpire_name <chr>, rf_umpire_id <chr>,
## #   rf_umpire_name <chr>, v_manager_id <chr>, v_manager_name <chr>,
## #   h_manager_id <chr>, h_manager_name <chr>, winning_pitcher_id <chr>,
## #   winning_pitcher_name <chr>, losing_pitcher_id <chr>,
## #   losing_pitcher_name <chr>, saving_pitcher_id <chr>,
## #   saving_pitcher_name <chr>, winning_rbi_batter_id <chr>,
## #   winning_rbi_batter_id_name <chr>, v_starting_pitcher_id <chr>,
## #   v_starting_pitcher_name <chr>, h_starting_pitcher_id <chr>,
## #   h_starting_pitcher_name <chr>, v_player_1_id <chr>, v_player_1_name <chr>,
## #   ...

The game log tibble has 171,907 rows and 161 columns with data about baseball games arranged chronologically from 1871 to 2016.

Below is a gist of the information available: General game information Teams information Forfeit/protest
Scores Offensive, pitching, defensive stats Umpire, manager information Pitchers’ information Miscellaneous additional information *Acquisition information

Note: There is no primary key that can uniquely identify each game.

dim(park_codes)
## [1] 252   9
head(park_codes, 10)
## # A tibble: 10 x 9
##    park_id name       aka          city   state start  end   league notes       
##    <chr>   <chr>      <chr>        <chr>  <chr> <chr>  <chr> <chr>  <chr>       
##  1 ALB01   Riverside~ <NA>         Albany NY    09/11~ 05/3~ NL     TRN:9/11/80~
##  2 ALT01   Columbia ~ <NA>         Altoo~ PA    04/30~ 05/3~ UA     <NA>        
##  3 ANA01   Angel Sta~ Edison Fiel~ Anahe~ CA    04/19~ <NA>  AL     <NA>        
##  4 ARL01   Arlington~ <NA>         Arlin~ TX    04/21~ 10/0~ AL     <NA>        
##  5 ARL02   Rangers B~ The Ballpar~ Arlin~ TX    04/11~ <NA>  AL     <NA>        
##  6 ATL01   Atlanta-F~ <NA>         Atlan~ GA    04/12~ 09/2~ NL     <NA>        
##  7 ATL02   Turner Fi~ <NA>         Atlan~ GA    04/04~ 10/0~ NL     <NA>        
##  8 ATL03   Suntrust ~ <NA>         Atlan~ GA    04/14~ <NA>  NL     <NA>        
##  9 BAL01   Madison A~ <NA>         Balti~ MD    07/08~ 07/0~ <NA>   WS3         
## 10 BAL02   Newington~ <NA>         Balti~ MD    04/22~ 09/3~ <NA>   BL1:1872-74~

There are 252 rows and 9 columns in this tibble, listing out information about baseball parks. There is a primary key column whose values match that in game_log.

dim(person_codes)
## [1] 20494     7
head(person_codes, 10)
## # A tibble: 10 x 7
##    id       last        first    player_debut mgr_debut coach_debut ump_debut 
##    <chr>    <chr>       <chr>    <chr>        <chr>     <chr>       <chr>     
##  1 aardd001 Aardsma     David    04/06/2004   <NA>      <NA>        <NA>      
##  2 aaroh101 Aaron       Hank     04/13/1954   <NA>      <NA>        <NA>      
##  3 aarot101 Aaron       Tommie   04/10/1962   <NA>      04/06/1979  <NA>      
##  4 aased001 Aase        Don      07/26/1977   <NA>      <NA>        <NA>      
##  5 abada001 Abad        Andy     09/10/2001   <NA>      <NA>        <NA>      
##  6 abadf001 Abad        Fernando 07/28/2010   <NA>      <NA>        <NA>      
##  7 abadj101 Abadie      John     04/26/1875   <NA>      <NA>        <NA>      
##  8 abbae101 Abbaticchio Ed       09/04/1897   <NA>      <NA>        <NA>      
##  9 abbeb101 Abbey       Bert     06/14/1892   <NA>      <NA>        <NA>      
## 10 abbec101 Abbey       Charlie  08/16/1893   <NA>      <NA>        07/21/1897

There are 20,494 rows and 7 columns of data about people and their debut dates as player, manager, coach or umpire. It seems like one person may take up more than one of these 4 roles.

dim(team_codes)
## [1] 150   8
head(team_codes, 10)
## # A tibble: 10 x 8
##    team_id league start   end city      nickname        franch_id   seq
##    <chr>   <chr>  <dbl> <dbl> <chr>     <chr>           <chr>     <dbl>
##  1 ALT     UA      1884  1884 Altoona   Mountain Cities ALT           1
##  2 ARI     NL      1998     0 Arizona   Diamondbacks    ARI           1
##  3 BFN     NL      1879  1885 Buffalo   Bisons          BFN           1
##  4 BFP     PL      1890  1890 Buffalo   Bisons          BFP           1
##  5 BL1     <NA>    1872  1874 Baltimore Canaries        BL1           1
##  6 BL2     AA      1882  1891 Baltimore Orioles         BL2           1
##  7 BLN     NL      1892  1899 Baltimore Orioles         BL2           2
##  8 BL4     <NA>    1873  1873 Baltimore Marylands       BL4           1
##  9 BLA     AL      1901  1902 Baltimore Orioles         BLA           1
## 10 NYA     AL      1903     0 New York  Yankees         BLA           2

There are 150 rows and 8 columns of data listing team codes, and related information like franchise, nickname and city. The column team_id column appears in game_log as well.

Getting Familiar with Baseball

Defensive Positions There are separate columns for each player on the defense, listing their defensive position. Based on research from https://www.rookieroad.com/baseball/101/defense/ below are the defense positions:

Below is list of unique values from the defense position of player 1 of visiting team.

table(game_log$v_player_1_def_pos)
## 
##     1     2     3     4     5     6     7     8     9    10 
##     2   515  3383 28458 11923 22220 19595 38715 14965  1062

There is an additional 10th position listed with about 1062 rows with this value for this column. This may be a way to indicate an extra player as there are only 9 players on the field from the defense team. There cannot be a 10th position, so we can name that as an unknown position.

Leagues

Professional baseball consists of major league baseball, minor league baseball and independent leagues.

National League and American League are part of major league baseball while 6 and 8 leagues constitute minor league baseball and independent leagues respectively.

table(game_log$v_league, game_log$h_league)
##     
##         AA    AL    FL    NL    PL    UA
##   AA  5039     0     0     0     0     0
##   AL     0 72475     0  2238     0     0
##   FL     0     0  1243     0     0     0
##   NL     0  2237     0 86629     0     0
##   PL     0     0     0     0   532     0
##   UA     0     0     0     0     0   428
sum(is.na(game_log$h_league))
## [1] 1086

Game_log consists information of 8 leagues and over 1000 rows with NA value for league. The leagues we see are: * AA: American Association * AL: American League * FL: Federal League * NL: National League * PL: Players League * UA: Union Association

More research about the game can be conducted from https://www.rookieroad.com/baseball/101/

Importing Data into SQLite Database

Now that we have the data imported into R using read_csv(), we can import the data into a SQLite database using RSQLite.

library(RSQLite)
library(DBI)
conn <- dbConnect(SQLite(), "my_mlb.db")

dbWriteTable(conn = conn, name = "game_log", value = game_log, row.names = FALSE, header = TRUE)

dbWriteTable(conn = conn, name = "person_codes", value = person_codes, row.names = FALSE, header = TRUE)

dbWriteTable(conn = conn, name = "team_codes", value = team_codes, row.names = FALSE, header = TRUE)

dbWriteTable(conn = conn, name = "park_codes", value = park_codes, row.names = FALSE, header = TRUE)

dbListTables(conn)
## [1] "game_log"     "park_codes"   "person_codes" "team_codes"

dbWriteTable() is the easiest way to write local data frame/file to a database table, avoiding manual work and possible errors.

Create and update new column within game_log to act as primary key

alter_prikey <- "ALTER TABLE game_log
                 ADD COLUMN game_id;"

dbExecute(conn, alter_prikey)
update_prikey <- "UPDATE game_log 
                  SET game_id = CAST(date AS INTEGER) || h_name || CAST(number_of_game AS INTEGER);"

dbExecute(conn, update_prikey)

Checking Correctness

check_query <- "SELECT game_id, 
                      date,
                      h_name,
                      number_of_game
                FROM game_log
                LIMIT 5;"

dbGetQuery(conn, check_query)
##        game_id     date h_name number_of_game
## 1 18710504FW10 18710504    FW1              0
## 2 18710505WS30 18710505    WS3              0
## 3 18710506RC10 18710506    RC1              0
## 4 18710508CH10 18710508    CH1              0
## 5 18710509TRO0 18710509    TRO              0

Now, the column game_id will uniquely identify a row in game_log.

Data Normalization

Identifying opportunities

Schema

Based on the identified normalization opportunities, a schema for the design of a new database is as below:

Schema The game_log table initially contained major chunk of data that repeats and also contains irrelevant information with respect to game level. This data is now separated into a new tables - team_appearance where a column home identifies whether a row belongs to home team or visiting team and contains all related data. It is linked to the game table by game_id field. Similarly, person, park, league and other tables contain information that does not need to be repeated for every game row.

Further, as we create and insert values into these new tables, we can walk through the decisions made and the reasoning behind them.

Create and Insert Values into New Tables

Tables without foreign keys

The tables person, appearance_type, park and league do not have foreign keys. Let us create these tables and insert values into them so that the remaining tables can be created with foreign key references.

create_person <- "CREATE TABLE person(
                    person_id TEXT PRIMARY KEY,
                    first_name TEXT,
                    last_name TEXT
                  );"

dbExecute(conn, create_person)

insert_person <- "INSERT INTO person
                  SELECT id, first, last
                  FROM person_codes;
                 "
dbExecute(conn, insert_person)
check_person <- "SELECT * FROM person LIMIT 5;"

dbGetQuery(conn, check_person)
##   person_id first_name last_name
## 1  aardd001      David   Aardsma
## 2  aaroh101       Hank     Aaron
## 3  aarot101     Tommie     Aaron
## 4  aased001        Don      Aase
## 5  abada001       Andy      Abad

The person_id, first_name and last_name are the only fields required from person_codes table as debut information can be obtained from other tables.

create_park <- "CREATE TABLE park(
                  park_id TEXT PRIMARY KEY,
                  name TEXT,
                  nickname TEXT,
                  city TEXT,
                  state TEXT,
                  notes TEXT
                );"
dbExecute(conn, create_park)

insert_park <- "INSERT INTO park
                SELECT park_id, name, aka, city, state, notes
                FROM park_codes;
               "
dbExecute(conn, insert_park)
check_park <- "SELECT * FROM park LIMIT 5;"

dbGetQuery(conn, check_park)
##   park_id                          name
## 1   ALB01                Riverside Park
## 2   ALT01                 Columbia Park
## 3   ANA01      Angel Stadium of Anaheim
## 4   ARL01             Arlington Stadium
## 5   ARL02 Rangers Ballpark in Arlington
##                                   nickname      city state
## 1                                     <NA>    Albany    NY
## 2                                     <NA>   Altoona    PA
## 3            Edison Field; Anaheim Stadium   Anaheim    CA
## 4                                     <NA> Arlington    TX
## 5 The Ballpark in Arlington; Ameriquest Fl Arlington    TX
##                                            notes
## 1 TRN:9/11/80;6/15&9/10/1881;5/16-5/18&5/30/1882
## 2                                           <NA>
## 3                                           <NA>
## 4                                           <NA>
## 5                                           <NA>

The data from park_codes has been inserted into new park table without start and end date. They can be obtained from other tables which mention the first and last use of these parks.

create_league <- "CREATE TABLE league(
                  league_id TEXT PRIMARY KEY,
                  name TEXT
                 );"
dbExecute(conn, create_league)

insert_league <- 'INSERT INTO league
                  VALUES("AA", "American Association"),("AL", "American League"), ("FL", "Federal League"), ("NL", "National League"),                   ("PL", "Players League"),("UA", "Union Association");'
dbExecute(conn, insert_league)
check_league <- "SELECT * FROM league LIMIT 5;"

dbGetQuery(conn, check_league)
##   league_id                 name
## 1        AA American Association
## 2        AL      American League
## 3        FL       Federal League
## 4        NL      National League
## 5        PL       Players League

The league names previously obtained are inserted into the league table with an ID that is referenced in team and team_appearance tables.

create_appr <- "CREATE TABLE appearance_type(
                  appearance_type_id TEXT PRIMARY KEY,
                  name TEXT,
                  category TEXT
                );"
dbExecute(conn, create_appr)

insert_appr <- 'INSERT INTO appearance_type
                VALUES("O1","Batter 1","offense"), ("O2",   "Batter 2", "offense"), ("O3","Batter 3","offense"), 
                ("O4","Batter 4","offense"), ("O5", "Batter 5", "offense"), ("O6","Batter 6","offense"), ("O7","Batter 7", "offense"),                 ("O8","Batter 8","offense"), ("O9","Batter 9","offense"), ("D1","Pitcher", "defense"), ("D2", "Catcher", "defense"),                  ("D3", "1st Base", "defense"), ("D4", "2nd Base", "defense"), ("D5", "3rd Base", "defense"), 
                ("D6", "Shortstop", "defense"), ("D7", "Left Field", "defense"), ("D8", "Center Field", "defense"),
                ("D9", "Right Field", "defense"), ("D10", "Unknown Position", "defense"), ("UHP", "Home Plate", "umpire"),
                ("U1B", "First Base","umpire"), ("U2B", "Second Base", "umpire"), ("U3B", "Third Base", "umpire"),
                ("ULF", "Left Field", "umpire"), ("URF", "Right Field", "umpire"), ("MM", "Manager", "manager"),
                ("AWP", "Winning Pitcher","award"), ("ALP", "Losing Pitcher", "award"), ("ASP", "Saving Pitcher", "award"), 
                ("AWB", "Winning RBI Batter", "award"), ("PSP", "Starting Pitcher", "pitcher")'

dbExecute(conn, insert_appr)
check_appr <- "SELECT * FROM appearance_type LIMIT 5;"

dbGetQuery(conn, check_appr)
##   appearance_type_id     name category
## 1                 O1 Batter 1  offense
## 2                 O2 Batter 2  offense
## 3                 O3 Batter 3  offense
## 4                 O4 Batter 4  offense
## 5                 O5 Batter 5  offense

The appearance_type table consists of information regarding the various roles that can be taken on by a player, umpire or manager.

Team and Game tables

create_team <- "CREATE TABLE team(
                  team_id TEXT PRIMARY KEY,
                  league_id TEXT,
                  city TEXT,
                  nickname TEXT,
                  franch_id TEXT,
                  FOREIGN KEY(league_id) REFERENCES league(league_id)
                );"
dbExecute(conn, create_team)

insert_team <- "INSERT INTO team
                SELECT team_id, league, city, nickname, franch_id
                FROM team_codes;"
dbExecute(conn, insert_team)
check_team <- ("SELECT * FROM team LIMIT 5;")

dbGetQuery(conn, check_team)
##   team_id league_id      city        nickname franch_id
## 1     ALT        UA   Altoona Mountain Cities       ALT
## 2     ARI        NL   Arizona    Diamondbacks       ARI
## 3     BFN        NL   Buffalo          Bisons       BFN
## 4     BFP        PL   Buffalo          Bisons       BFP
## 5     BL1      <NA> Baltimore        Canaries       BL1

Start and end dates have been removed from team_codes as this can be obtained from game level data.

create_game <- "CREATE TABLE game (
                  game_id TEXT PRIMARY KEY,
                  date TEXT,
                  number_of_game INTEGER,
                  park_id TEXT,
                  length_outs INTEGER,
                  day BOOLEAN,
                  completion TEXT,
                  forfeit TEXT,
                  protest TEXT,
                  attendance INTEGER,
                  length_in_mins INTEGER,
                  additional_info TEXT,
                  acquisition_info TEXT,
                  FOREIGN KEY(park_id) REFERENCES park(park_id)
                );"
dbExecute(conn, create_game)

update_game_date <- 'UPDATE game
                    SET date = SUBSTR(date, 1, 4) || "-" || SUBSTR(date, 5, 2) || "-" || SUBSTR(date, 7, 2);'



dbExecute(conn, update_game_date)

insert_game <- "INSERT INTO game
                SELECT game_id, date, number_of_game, park_id, length_outs,
                  CASE WHEN day_night = 'D' THEN 1 ELSE 0 END AS day,
                  completion, forfeit, protest, attendance, length_minutes, 
                  additional_info, acquisition_info
                FROM game_log;"
dbExecute(conn, insert_game)
check_game <- "SELECT * FROM game LIMIT 5;"

dbGetQuery(conn, check_game)
##        game_id       date number_of_game park_id length_outs day completion
## 1 18710504FW10 18710504.0              0   FOR01          54   1       <NA>
## 2 18710505WS30 18710505.0              0   WAS01          54   1       <NA>
## 3 18710506RC10 18710506.0              0   RCK01          54   1       <NA>
## 4 18710508CH10 18710508.0              0   CHI01          54   1       <NA>
## 5 18710509TRO0 18710509.0              0   TRO01          54   1       <NA>
##   forfeit protest attendance length_in_mins additional_info acquisition_info
## 1    <NA>    <NA>        200            120            <NA>                Y
## 2    <NA>    <NA>       5000            145            HTBF                Y
## 3    <NA>    <NA>       1000            140            <NA>                Y
## 4    <NA>    <NA>       5000            150            <NA>                Y
## 5    <NA>    <NA>       3250            145            HTBF                Y

The game_log table originally contained many columns which repeat for home and visiting teams. These columns have been separated out into different tables and the remaining columns which are entirely relevant to the game and the game alone are left behind in the columns as seen above.

create_team_appr <- "CREATE TABLE team_appearance (
                      team_id TEXT,
                      game_id TEXT,
                      home BOOLEAN,
                      league_id TEXT,
                      score INTEGER,
                      line_score TEXT,
                      at_bats INTEGER,
                      hits INTEGER,
                      doubles INTEGER,
                      triples INTEGER,
                      homeruns INTEGER,
                      rbi INTEGER,
                      sacrifice_hits INTEGER,
                      sacrifice_flies INTEGER,
                      hit_by_pitch INTEGER,
                      walks INTEGER,
                      intentional_walks INTEGER,
                      strikeouts INTEGER,
                      stolen_bases INTEGER,
                      caught_stealing INTEGER,
                      grounded_into_double INTEGER,
                      first_catcher_interference INTEGER,
                      left_on_base INTEGER,
                      pitchers_used INTEGER,
                      individual_earned_runs INTEGER,
                      team_earned_runs INTEGER,
                      wild_pitches INTEGER,
                      balks INTEGER,
                      putouts INTEGER,
                      assists INTEGER,
                      errors INTEGER,
                      passed_balls INTEGER,
                      double_plays INTEGER,
                      triple_plays INTEGER,
                      PRIMARY KEY (team_id, game_id),
                      FOREIGN KEY (team_id) REFERENCES team(team_id),
                      FOREIGN KEY (game_id) REFERENCES game(game_id),
                      FOREIGN KEY (team_id) REFERENCES team(team_id)
                  );
"
dbExecute(conn, create_team_appr)

insert_team_appr <- "INSERT INTO team_appearance
                      SELECT
                          h_name,
                          game_id,
                          1 AS home,
                          h_league,
                          h_score,
                          h_line_score,
                          h_at_bats,
                          h_hits,
                          h_doubles,
                          h_triples,
                          h_homeruns,
                          h_rbi,
                          h_sacrifice_hits,
                          h_sacrifice_flies,
                          h_hit_by_pitch,
                          h_walks,
                          h_intentional_walks,
                          h_strikeouts,
                          h_stolen_bases,
                          h_caught_stealing,
                          h_grounded_into_double,
                          h_first_catcher_interference,
                          h_left_on_base,
                          h_pitchers_used,
                          h_individual_earned_runs,
                          h_team_earned_runs,
                          h_wild_pitches,
                          h_balks,
                          h_putouts,
                          h_assists,
                          h_errors,
                          h_passed_balls,
                          h_double_plays,
                          h_triple_plays
                      FROM game_log
                  
                  UNION
                  
                      SELECT    
                          v_name,
                          game_id,
                          0 AS home,
                          v_league,
                          v_score,
                          v_line_score,
                          v_at_bats,
                          v_hits,
                          v_doubles,
                          v_triples,
                          v_homeruns,
                          v_rbi,
                          v_sacrifice_hits,
                          v_sacrifice_flies,
                          v_hit_by_pitch,
                          v_walks,
                          v_intentional_walks,
                          v_strikeouts,
                          v_stolen_bases,
                          v_caught_stealing,
                          v_grounded_into_double,
                          v_first_catcher_interference,
                          v_left_on_base,
                          v_pitchers_used,
                          v_individual_earned_runs,
                          v_team_earned_runs,
                          v_wild_pitches,
                          v_balks,
                          v_putouts,
                          v_assists,
                          v_errors,
                          v_passed_balls,
                          v_double_plays,
                          v_triple_plays
                      from game_log;
"
dbExecute(conn, insert_team_appr)
check_team_appr <- "SELECT * FROM team_appearance 
                    WHERE game_id IN (SELECT game_id from team_appearance ORDER BY home, at_bats ASC LIMIT 2)
                    
                    UNION
                    
                    SELECT * FROM team_appearance 
                    WHERE game_id IN (SELECT game_id from team_appearance ORDER BY home, at_bats DESC LIMIT 2)
                   "
dbGetQuery(conn, check_team_appr)
##   team_id      game_id home league_id score                 line_score at_bats
## 1     ALT 18840417CNU0    0        UA     2                       <NA>      NA
## 2     ALT 18840418CNU0    0        UA     2                       <NA>      NA
## 3     BRO 19200501BSN0    0        NL     1 00001000000000000000000000      85
## 4     BSN 19200501BSN0    1        NL     1 00000100000000000000000000      86
## 5     CNU 18840417CNU0    1        UA     7                       <NA>      NA
## 6     CNU 18840418CNU0    1        UA     9                       <NA>      NA
## 7     DET 19620624DET0    1        AL     7     3030010000000000000000      82
## 8     NYA 19620624DET0    0        AL     9     6100000000000000000002      85
##   hits doubles triples homeruns rbi sacrifice_hits sacrifice_flies hit_by_pitch
## 1   NA      NA      NA       NA  NA             NA              NA           NA
## 2   NA      NA      NA       NA  NA             NA              NA           NA
## 3    9       0       0        0   1              1               0            0
## 4   15       2       1        0   1              5               0            0
## 5   NA      NA      NA       NA  NA             NA              NA           NA
## 6   NA      NA      NA       NA  NA             NA              NA           NA
## 7   19       1       1        1   7              2               0            1
## 8   20       1       0        2   9              1               1            0
##   walks intentional_walks strikeouts stolen_bases caught_stealing
## 1    NA                NA         NA           NA              NA
## 2    NA                NA         NA           NA              NA
## 3     4                NA          7            2               0
## 4     5                NA          7            0               1
## 5    NA                NA         NA           NA              NA
## 6    NA                NA         NA           NA              NA
## 7    10                 2         18            1               0
## 8     9                 1         12            1               0
##   grounded_into_double first_catcher_interference left_on_base pitchers_used
## 1                   NA                         NA           NA            NA
## 2                   NA                         NA           NA            NA
## 3                   NA                          0           11             1
## 4                   NA                          0           17             1
## 5                   NA                         NA           NA            NA
## 6                   NA                         NA           NA            NA
## 7                    2                          0           22             7
## 8                    0                          0           21             7
##   individual_earned_runs team_earned_runs wild_pitches balks putouts assists
## 1                     NA               NA           NA    NA      NA      NA
## 2                     NA               NA           NA    NA      NA      NA
## 3                      1                0            0     0      78      31
## 4                      1                0            1     0      78      41
## 5                     NA               NA           NA    NA      NA      NA
## 6                     NA               NA           NA    NA      NA      NA
## 7                      9                9            2     0      66      27
## 8                      7                7            0     0      66      20
##   errors passed_balls double_plays triple_plays
## 1     NA           NA           NA           NA
## 2     NA           NA           NA           NA
## 3      2            0            1            0
## 4      2            0            1            0
## 5     NA           NA           NA           NA
## 6     NA           NA           NA           NA
## 7      3            0            0            0
## 8      4            0            4            0

All information regarding team, appearances and scores are listed in the team_appearance table with links to game, league and team tables. We use UNION here because we are essentially reducing the width of the table. All the home team appearances are extracted and placed into the generic new columns. Then the visiting team appearances are extracted and inserted below the home team appearances thereby increasing the length of the table.

# -- Creating integer primary key which is automatically populated by SQLite -- 
create_person_appr <- "CREATE TABLE person_appearance(
                        appearance_id INTEGER PRIMARY KEY,
                        person_id TEXT,
                        team_id TEXT,
                        game_id TEXT,
                        appearance_type_id TEXT,
                        
                        FOREIGN KEY(person_id) REFERENCES person(person_id),
                        FOREIGN KEY(team_id) REFERENCES team(team_id),
                        FOREIGN KEY(game_id) REFERENCES game(game_id),
                        FOREIGN KEY(appearance_type_id) REFERENCES appearance_type(appearance_type_id)
                       );"
dbExecute(conn, create_person_appr)

# -- Inserting only the columns mentioned within INSERT clause and in that order --
# -- There are several groups that need to be extracted individually from game_log using UNION. --
insert_person_appr <- "INSERT INTO person_appearance (
                            game_id,
                            team_id,
                            person_id,
                            appearance_type_id
                        ) 
                            SELECT
                                game_id,
                                NULL,
                                hp_umpire_id,
                                'UHP'
                            FROM game_log
                            WHERE hp_umpire_id IS NOT NULL    
                        
                        UNION
                        
                            SELECT
                                game_id,
                                NULL,
                                [1b_umpire_id],
                                'U1B'
                            FROM game_log
                            WHERE '1b_umpire_id' IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                NULL,
                                [2b_umpire_id],
                                'U2B'
                            FROM game_log
                            WHERE [2b_umpire_id] IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                NULL,
                                [3b_umpire_id],
                                'U3B'
                            FROM game_log
                            WHERE [3b_umpire_id] IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                NULL,
                                lf_umpire_id,
                                'ULF'
                            FROM game_log
                            WHERE lf_umpire_id IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                NULL,
                                rf_umpire_id,
                                'URF'
                            FROM game_log
                            WHERE rf_umpire_id IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                v_name,
                                v_manager_id,
                                'MM'
                            FROM game_log
                            WHERE v_manager_id IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                h_name,
                                h_manager_id,
                                'MM'
                            FROM game_log
                            WHERE h_manager_id IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                CASE
                                    WHEN h_score > v_score THEN h_name
                                    ELSE v_name
                                    END,
                                winning_pitcher_id,
                                'AWP'
                            FROM game_log
                            WHERE winning_pitcher_id IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                CASE
                                    WHEN h_score < v_score THEN h_name
                                    ELSE v_name
                                    END,
                                losing_pitcher_id,
                                'ALP'
                            FROM game_log
                            WHERE losing_pitcher_id IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                CASE
                                    WHEN h_score > v_score THEN h_name
                                    ELSE v_name
                                    END,
                                saving_pitcher_id,
                                'ASP'
                            FROM game_log
                            WHERE saving_pitcher_id IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                CASE
                                    WHEN h_score > v_score THEN h_name
                                    ELSE v_name
                                    END,
                                winning_rbi_batter_id,
                                'AWB'
                            FROM game_log
                            WHERE winning_rbi_batter_id IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                v_name,
                                v_starting_pitcher_id,
                                'PSP'
                            FROM game_log
                            WHERE v_starting_pitcher_id IS NOT NULL
                        
                        UNION
                        
                            SELECT
                                game_id,
                                h_name,
                                h_starting_pitcher_id,
                                'PSP'
                            FROM game_log
                            WHERE h_starting_pitcher_id IS NOT NULL;"
                      
dbExecute(conn, insert_person_appr)
  
# -- Remaining columns include team-wise player appearances like 'v_player_1_id', 'h_player_9_id' --
# -- Inner loop goes over 9 iterations for each of the 2 outer loop iterations, once for 'c' and once for 'h' -- 
# -- Replacing placeholders for team identifier and player number at the end of each iteration and executing the
#    template string containing the INSERT command.
for (letter in c("h", "v")) {
  for (num in 1:9) {
    template <- "INSERT INTO person_appearance (
                    game_id,
                    team_id,
                    person_id,
                    appearance_type_id
                ) 
                    SELECT
                        game_id,
                        %s_name,
                        %s_player_%f_id,
                        'O%f'
                    FROM game_log
                    WHERE %s_player_%f_id IS NOT NULL
                
                UNION
                
                    SELECT
                        game_id,
                        %s_name,
                        %s_player_%f_id,
                        'D' || CAST(%s_player_%f_def_pos AS INT)
                    FROM game_log
                    WHERE %s_player_%f_id IS NOT NULL;
    "
    template <- gsub("%s", letter, template, fixed = TRUE)
    template <- gsub("%f", num, template, fixed = TRUE)
    
    dbExecute(conn, template)
  }
}
check_person_appr <- "SELECT appearance_id, person_id, team_id, game_id, appearance_type_id
                      FROM (
                            SELECT ROW_NUMBER () OVER(PARTITION BY appearance_type_id) RowNum, * 
                            FROM person_appearance)
                      WHERE RowNum = 1
                      LIMIT 6;"

dbGetQuery(conn, check_person_appr)
##   appearance_id person_id team_id      game_id appearance_type_id
## 1             4  prata101     CL1 18710504FW10                ALP
## 2           115  wrigh101     BS1 18710520BS10                ASP
## 3        308412  grimr103     CHN 19210413CHN0                AWB
## 4             7  mathb101     FW1 18710504FW10                AWP
## 5       1825376  tovac101     MIN 19680922MIN0                 D1
## 6       1829396  molib101     CHA 19800410CHA0                D10

Appearances in the game by managers, players, and umpires is detailed in the person_appearance table with the role played by them coming from appearance_type table. Once again we use UNION as we are performing a similar task where columns which were originally making the table too wide are being shrunk to increase in length.

Deleting Original Tables

Now that we have normalized tables, we can go ahead and clean the database of previously created tables. We will first check the tables in the database and select only the older tables to be deleted, looping over to delete them all.

dbListTables(conn)
##  [1] "appearance_type"   "game"              "game_log"         
##  [4] "league"            "park"              "park_codes"       
##  [7] "person"            "person_appearance" "person_codes"     
## [10] "team"              "team_appearance"   "team_codes"
delete_tables <- c("game_log", "park_codes", "team_codes", "person_codes")

for(t in delete_tables){
  drop_cmnd <- sprintf("DROP TABLE IF EXISTS %s", t) 
  dbExecute(conn, drop_cmnd)
}
dbListTables(conn)
## [1] "appearance_type"   "game"              "league"           
## [4] "park"              "person"            "person_appearance"
## [7] "team"              "team_appearance"