Presentation R and RDBMS

Using the RSQLite and DBI packages

options(warn=-1)
library(RSQLite)
library(DBI)
library(plyr)
library(ggplot2)
library(reshape2)

Connecting to the a SQLite version of the Lahman Baseball Database

con = dbConnect(RSQLite::SQLite(),'lahman2016.sqlite')

List Tables

dbListTables(con)
##  [1] "AllstarFull"         "Appearances"         "AwardsManagers"     
##  [4] "AwardsPlayers"       "AwardsShareManagers" "AwardsSharePlayers" 
##  [7] "Batting"             "BattingPost"         "CollegePlaying"     
## [10] "Fielding"            "FieldingOF"          "FieldingOFsplit"    
## [13] "FieldingPost"        "HallOfFame"          "HomeGames"          
## [16] "Managers"            "ManagersHalf"        "Master"             
## [19] "Parks"               "Pitching"            "PitchingPost"       
## [22] "Salaries"            "Schools"             "SeriesPost"         
## [25] "Teams"               "TeamsFranchises"     "TeamsHalf"

Master Table contains bibliographic information about the players

dbListFields(con, "Master")
##  [1] "playerID"     "birthYear"    "birthMonth"   "birthDay"    
##  [5] "birthCountry" "birthState"   "birthCity"    "deathYear"   
##  [9] "deathMonth"   "deathDay"     "deathCountry" "deathState"  
## [13] "deathCity"    "nameFirst"    "nameLast"     "nameGiven"   
## [17] "weight"       "height"       "bats"         "throws"      
## [21] "debut"        "finalGame"    "retroID"      "bbrefID"

Select Pitchers with 3,000 or more strikeouts by using a SQL statement to join the Master and Pitching tables

pitchers<- dbGetQuery(con, "SELECT Master.nameLast, Master.nameGiven, sum(Pitching.SO) AS 'Strikeouts', sum(Pitching.W) As 'Wins'
                                            FROM Master  
                                            INNER JOIN Pitching ON Pitching.playerID = Master.playerID
                                            GROUP BY Master.nameLast, Master.nameGiven 
                                            HAVING sum(Pitching.SO) >= 3000
                                            Order BY sum(Pitching.SO) desc, sum(Pitching.W) desc ")
pitchers
##     nameLast       nameGiven Strikeouts Wins
## 1       Ryan      Lynn Nolan       5714  324
## 2    Johnson   Randall David       4875  303
## 3    Clemens   William Roger       4672  354
## 4    Carlton   Steven Norman       4136  329
## 5   Blyleven     Rik Aalbert       3701  287
## 6     Seaver   George Thomas       3640  311
## 7     Sutton   Donald Howard       3574  324
## 8      Perry Gaylord Jackson       3534  314
## 9    Johnson    Walter Perry       3509  417
## 10    Maddux    Gregory Alan       3371  355
## 11    Niekro    Philip Henry       3342  318
## 12   Jenkins Ferguson Arthur       3192  284
## 13  Martinez     Pedro Jaime       3154  219
## 14    Gibson          Robert       3117  251
## 15 Schilling Curtis Montague       3116  216
## 16    Smoltz     John Andrew       3084  213
top_3_pitchers<- dbGetQuery(con, "SELECT Master.nameLast || ', ' || Master.nameGiven AS 'Name', sum(Pitching.SO) AS 'Strikeouts',     sum(Pitching.W) As                                  'Wins'
                                            FROM Master  
                                            INNER JOIN Pitching ON Pitching.playerID = Master.playerID
                                            GROUP BY Master.nameLast, Master.nameGiven 
                                            HAVING sum(Pitching.SO) >= 3000
                                            Order BY sum(Pitching.SO) desc, sum(Pitching.W) desc 
                                            Limit 3")
top_3_pitchers
##                     Name Strikeouts Wins
## 1       Ryan, Lynn Nolan       5714  324
## 2 Johnson, Randall David       4875  303
## 3 Clemens, William Roger       4672  354
data.m <- melt(top_3_pitchers, id.vars='Name')

# plot everything
ggplot(data.m, aes(Name, value)) +   
  geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
geom_text(aes(label=value), vjust=1.6, hjust=-.25, color="blue",
            position = position_dodge(1.0), size=3.5)