Data Dive Contents:

  1. Finding out the summary of the numeric and categorical columns.
  2. Analysis of batting averages by team and by league, and their trend over time.
  3. Analysis of runs per game scored, by team and by year.
  4. Finding the correlation between wins and runs scored, and between wins and runs allowed.
  5. Interpreting the home runs hit by teams in the three different divisions (East, West, Central).
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(rmarkdown)
# Load the Lahman team-season table from a local CSV export, then preview
# its first six rows.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this report on another machine.
lahman_data <- read.csv(
  "/Users/anuragreddy/Desktop/Statistics with R/Lahmans Databse .csv"
)
head(lahman_data)
##   yearID lgID teamID franchID divID Rank   G Ghome  W  L DivWin WCWin LgWin
## 1   2000   AL    ANA      ANA     W    3 162    81 82 80      N     N     N
## 2   2000   AL    BAL      BAL     E    4 162    81 74 88      N     N     N
## 3   2000   AL    BOS      BOS     E    2 162    81 85 77      N     N     N
## 4   2000   AL    CHA      CHW     C    1 162    81 95 67      Y     N     N
## 5   2000   AL    CLE      CLE     C    2 162    81 90 72      N     N     N
## 6   2000   AL    DET      DET     C    3 162    81 79 83      N     N     N
##   WSWin   R   AB    H X2B X3B  HR  BB   SO  SB CS HBP SF  RA  ER  ERA CG SHO SV
## 1     N 864 5628 1574 309  34 236 608 1024  93 52  47 43 869 805 5.00  5   3 46
## 2     N 794 5549 1508 310  22 184 558  900 126 65  49 54 913 855 5.37 14   6 33
## 3     N 792 5630 1503 316  32 167 611 1019  43 30  42 48 745 683 4.23  7  12 46
## 4     N 978 5646 1615 325  33 216 591  960 119 42  53 61 839 751 4.66  5   7 43
## 5     N 950 5683 1639 310  30 221 685 1057 113 34  51 52 816 775 4.84  6   5 34
## 6     N 823 5644 1553 307  41 177 562  982  83 38  43 49 827 755 4.71  6   6 44
##   IPouts   HA HRA BBA  SOA   E  DP    FP              name
## 1   4344 1534 228 662  846 134 182 0.978    Anaheim Angels
## 2   4300 1547 202 665 1017 116 151 0.981 Baltimore Orioles
## 3   4358 1433 173 498 1121 109 120 0.982    Boston Red Sox
## 4   4351 1509 195 614 1037 133 190 0.978 Chicago White Sox
## 5   4327 1511 173 666 1213  72 147 0.988 Cleveland Indians
## 6   4330 1583 177 496  978 105 171 0.983    Detroit Tigers
##                          park attendance BPF PPF teamIDBR teamIDlahman45
## 1  Edison International Field    2066982 102 103      ANA            ANA
## 2 Oriole Park at Camden Yards    3297031  95  96      BAL            BAL
## 3              Fenway Park II    2585895 104 103      BOS            BOS
## 4            Comiskey Park II    1947799 102 102      CHW            CHA
## 5                Jacobs Field    3456278 101 100      CLE            CLE
## 6               Comerica Park    2438617  95  95      DET            DET
##   teamIDretro
## 1         ANA
## 2         BAL
## 3         BOS
## 4         CHA
## 5         CLE
## 6         DET

Question 1: Summary of Numeric and Categorical columns.

# Five-number summary (plus mean) of team at-bats (AB) per season.
summary(lahman_data[["AB"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1752    5462    5527    5368    5582    5770
# Five-number summary (plus mean) of team home runs (HR) per season.
summary(lahman_data[["HR"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    51.0   144.0   168.0   169.6   198.0   307.0
# Number of team-season rows in each division (C, E, W); the data covers
# seasons 2000 through 2022.
table(lahman_data[["divID"]])
## 
##   C   E   W 
## 233 220 207
# divID is read in as a character column, so summary() reports only its
# length, class and mode rather than per-division counts.
summary(lahman_data[["divID"]])
##    Length     Class      Mode 
##       660 character character
# Number of team-season rows in each league (AL, NL); the data covers
# seasons 2000 through 2022.
table(lahman_data[["lgID"]])
## 
##  AL  NL 
## 317 343
# Summary of the numeric columns only.
# The previous form, `summary(across(where(is.numeric)))`, passed the
# `across()` call positionally into summary()'s `maxsum` argument, where it
# was never evaluated, so every column (including character ones) was
# summarised. Selecting the numeric columns first restricts the summary as
# intended.
# NOTE: the printed output that follows in this rendered report was produced
# by the old call; re-knit to refresh it.
lahman_data |>
  select(where(is.numeric)) |>
  summary()
##      yearID         lgID              teamID            franchID        
##  Min.   :2000   Length:660         Length:660         Length:660        
##  1st Qu.:2005   Class :character   Class :character   Class :character  
##  Median :2010   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2011                                                           
##  3rd Qu.:2016                                                           
##  Max.   :2022                                                           
##     divID                Rank             G             Ghome      
##  Length:660         Min.   :1.000   Min.   : 58.0   Min.   :24.00  
##  Class :character   1st Qu.:2.000   1st Qu.:162.0   1st Qu.:81.00  
##  Mode  :character   Median :3.000   Median :162.0   Median :81.00  
##                     Mean   :3.008   Mean   :157.3   Mean   :78.64  
##                     3rd Qu.:4.000   3rd Qu.:162.0   3rd Qu.:81.00  
##                     Max.   :6.000   Max.   :163.0   Max.   :84.00  
##        W                L             DivWin             WCWin          
##  Min.   : 19.00   Min.   : 17.00   Length:660         Length:660        
##  1st Qu.: 71.00   1st Qu.: 70.00   Class :character   Class :character  
##  Median : 80.00   Median : 79.00   Mode  :character   Mode  :character  
##  Mean   : 78.65   Mean   : 78.65                                        
##  3rd Qu.: 90.00   3rd Qu.: 89.00                                        
##  Max.   :116.00   Max.   :119.00                                        
##     LgWin              WSWin                 R               AB      
##  Length:660         Length:660         Min.   :219.0   Min.   :1752  
##  Class :character   Class :character   1st Qu.:671.0   1st Qu.:5462  
##  Mode  :character   Mode  :character   Median :730.0   Median :5527  
##                                        Mean   :718.1   Mean   :5368  
##                                        3rd Qu.:790.0   3rd Qu.:5582  
##                                        Max.   :978.0   Max.   :5770  
##        H             X2B             X3B              HR              BB       
##  Min.   : 390   Min.   : 73.0   Min.   : 3.00   Min.   : 51.0   Min.   :147.0  
##  1st Qu.:1356   1st Qu.:264.8   1st Qu.:21.00   1st Qu.:144.0   1st Qu.:464.0  
##  Median :1426   Median :282.5   Median :28.00   Median :168.0   Median :514.0  
##  Mean   :1387   Mean   :276.8   Mean   :28.04   Mean   :169.6   Mean   :506.5  
##  3rd Qu.:1492   3rd Qu.:302.0   3rd Qu.:35.00   3rd Qu.:198.0   3rd Qu.:564.0  
##  Max.   :1667   Max.   :376.0   Max.   :61.00   Max.   :307.0   Max.   :775.0  
##        SO             SB               CS             HBP        
##  Min.   : 440   Min.   : 14.00   Min.   : 3.00   Min.   : 10.00  
##  1st Qu.:1032   1st Qu.: 65.75   1st Qu.:27.00   1st Qu.: 47.00  
##  Median :1146   Median : 87.00   Median :34.00   Median : 55.00  
##  Mean   :1145   Mean   : 88.35   Mean   :34.56   Mean   : 56.52  
##  3rd Qu.:1287   3rd Qu.:109.00   3rd Qu.:42.00   3rd Qu.: 66.00  
##  Max.   :1596   Max.   :200.00   Max.   :74.00   Max.   :112.00  
##        SF              RA              ER             ERA       
##  Min.   : 7.00   Min.   :209.0   Min.   :181.0   Min.   :2.800  
##  1st Qu.:36.00   1st Qu.:666.0   1st Qu.:609.0   1st Qu.:3.860  
##  Median :42.00   Median :729.0   Median :671.5   Median :4.210  
##  Mean   :42.04   Mean   :718.1   Mean   :660.5   Mean   :4.249  
##  3rd Qu.:49.00   3rd Qu.:797.0   3rd Qu.:733.0   3rd Qu.:4.633  
##  Max.   :75.00   Max.   :981.0   Max.   :913.0   Max.   :5.840  
##        CG              SHO               SV            IPouts    
##  Min.   : 0.000   Min.   : 0.000   Min.   : 6.00   Min.   :1419  
##  1st Qu.: 2.000   1st Qu.: 6.000   1st Qu.:35.00   1st Qu.:4302  
##  Median : 4.000   Median : 9.000   Median :40.00   Median :4329  
##  Mean   : 4.323   Mean   : 9.174   Mean   :39.53   Mean   :4205  
##  3rd Qu.: 6.000   3rd Qu.:12.000   3rd Qu.:45.00   3rd Qu.:4357  
##  Max.   :18.000   Max.   :23.000   Max.   :66.00   Max.   :4485  
##        HA            HRA             BBA             SOA             E         
##  Min.   : 376   Min.   : 62.0   Min.   :145.0   Min.   : 393   Min.   : 20.00  
##  1st Qu.:1356   1st Qu.:151.0   1st Qu.:466.8   1st Qu.:1034   1st Qu.: 86.00  
##  Median :1424   Median :170.0   Median :516.0   Median :1153   Median : 98.00  
##  Mean   :1387   Mean   :169.6   Mean   :506.5   Mean   :1145   Mean   : 96.47  
##  3rd Qu.:1493   3rd Qu.:190.0   3rd Qu.:560.0   3rd Qu.:1270   3rd Qu.:109.00  
##  Max.   :1683   Max.   :305.0   Max.   :728.0   Max.   :1671   Max.   :145.00  
##        DP              FP             name               park          
##  Min.   : 33.0   Min.   :0.9760   Length:660         Length:660        
##  1st Qu.:132.0   1st Qu.:0.9820   Class :character   Class :character  
##  Median :145.0   Median :0.9840   Mode  :character   Mode  :character  
##  Mean   :141.9   Mean   :0.9835                                        
##  3rd Qu.:158.0   3rd Qu.:0.9850                                        
##  Max.   :204.0   Max.   :0.9910                                        
##    attendance           BPF             PPF          teamIDBR        
##  Min.   :      0   Min.   : 88.0   Min.   : 88.0   Length:660        
##  1st Qu.:1767692   1st Qu.: 97.0   1st Qu.: 97.0   Class :character  
##  Median :2321000   Median :100.0   Median :100.0   Mode  :character  
##  Mean   :2275828   Mean   :100.1   Mean   :100.1                     
##  3rd Qu.:2915270   3rd Qu.:103.0   3rd Qu.:103.0                     
##  Max.   :4298655   Max.   :125.0   Max.   :125.0                     
##  teamIDlahman45     teamIDretro       
##  Length:660         Length:660        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Print the league ID (lgID) of every row, as a check of the column that the
# batting-average analysis below groups by.
lahman_data[["lgID"]]
##   [1] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
##  [16] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
##  [31] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
##  [46] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
##  [61] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
##  [76] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
##  [91] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
## [106] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [121] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
## [136] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [151] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
## [166] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [181] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
## [196] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [211] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
## [226] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [241] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
## [256] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [271] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
## [286] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [301] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
## [316] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [331] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
## [346] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [361] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "NL"
## [376] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [391] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL"
## [406] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [421] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL"
## [436] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [451] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL"
## [466] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [481] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL"
## [496] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [511] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL"
## [526] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [541] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL"
## [556] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [571] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL"
## [586] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [601] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL"
## [616] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"
## [631] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL"
## [646] "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL" "NL"

Question 2: Let's find the batting average of each team, grouped by year and league.

# Batting average (total hits / total at-bats) for every team, season and
# league combination, sorted from highest to lowest.
# `.groups = "keep"` leaves the result grouped by teamID, yearID and lgID.
p <- lahman_data |>
  group_by(teamID, yearID, lgID) |>
  summarise(
    Batting_Avg = sum(H) / sum(AB),
    .groups = "keep"
  ) |>
  arrange(desc(Batting_Avg))

p
## # A tibble: 660 × 4
## # Groups:   teamID, yearID, lgID [660]
##    teamID yearID lgID  Batting_Avg
##    <chr>   <int> <chr>       <dbl>
##  1 COL      2000 NL          0.294
##  2 COL      2001 NL          0.292
##  3 NYA      2007 AL          0.290
##  4 BOS      2003 AL          0.289
##  5 CLE      2000 AL          0.288
##  6 SEA      2001 AL          0.288
##  7 KCA      2000 AL          0.288
##  8 MIN      2006 AL          0.287
##  9 DET      2007 AL          0.287
## 10 SEA      2007 AL          0.287
## # ℹ 650 more rows

Insights: Colorado is the only National League team in the top ten, and it had the highest batting average in two consecutive years, 2000 and 2001.

Question 3: Let's find out which league has the better batting average and what the trend is over the years.

# League-level batting average per season (AL vs NL): total hits over total
# at-bats across all teams in each (yearID, lgID) pair. This table feeds the
# visualization that follows.
# `.groups = "keep"` leaves the result grouped by yearID and lgID.
k <- lahman_data |>
  group_by(yearID, lgID) |>
  summarise(
    Batting_Avg = sum(H) / sum(AB),
    .groups = "keep"
  )

k
## # A tibble: 44 × 3
## # Groups:   yearID, lgID [44]
##    yearID lgID  Batting_Avg
##     <int> <chr>       <dbl>
##  1   2000 AL          0.276
##  2   2000 NL          0.266
##  3   2001 AL          0.267
##  4   2001 NL          0.261
##  5   2002 AL          0.264
##  6   2002 NL          0.259
##  7   2003 AL          0.267
##  8   2003 NL          0.262
##  9   2004 AL          0.270
## 10   2004 NL          0.263
## # ℹ 34 more rows
# Plot league batting average against season: one point per league-season,
# coloured by league, with a smoothed trend line per league and no
# confidence band.
ggplot(
  k,
  aes(x = yearID, y = Batting_Avg, group = lgID, colour = lgID)
) +
  geom_point() +
  stat_smooth(se = FALSE) +
  theme_classic()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Insights: The visualization above shows that, overall, team batting averages have decreased noticeably over the period. In terms of batting average, the American League has performed better than the National League.

Question 4: Let's find out which team has the highest runs-per-game figure, and in which year (2000-2022).

# Rank every team-season by runs per game (RPG = runs scored / games played),
# highest first, to see which team had the best RPG and in which year
# (seasons 2000-2022). Only the year, team and RPG columns are shown.
lahman_data |>
  mutate(RPG = R / G) |>
  select(yearID, teamID, RPG) |>
  arrange(desc(RPG))
##     yearID teamID      RPG
## 1     2000    CHA 6.037037
## 2     2000    COL 5.975309
## 3     2007    NYA 5.975309
## 4     2003    BOS 5.932099
## 5     2000    OAK 5.881988
## 6     2000    CLE 5.864198
## 7     2004    BOS 5.858025
## 8     2019    NYA 5.820988
## 9     2020    LAN 5.816667
## 10    2020    ATL 5.800000
## 11    2019    MIN 5.796296
## 12    2000    HOU 5.790123
## 13    2006    NYA 5.740741
## 14    2001    SEA 5.722222
## 15    2000    SFN 5.709877
## 16    2001    COL 5.697531
## 17    2019    HOU 5.679012
## 18    2009    NYA 5.648148
## 19    2005    BOS 5.617284
## 20    2000    SEA 5.598765
## 21    2003    ATL 5.598765
## 22    2002    NYA 5.571429
## 23    2008    TEX 5.561728
## 24    2019    BOS 5.561728
## 25    2001    CLE 5.537037
## 26    2004    NYA 5.537037
## 27    2017    HOU 5.530864
## 28    2003    TOR 5.518519
## 29    2007    PHI 5.506173
## 30    2015    TOR 5.500000
## 31    2001    TEX 5.493827
## 32    2000    SLN 5.475309
## 33    2007    DET 5.475309
## 34    2005    NYA 5.469136
## 35    2019    LAN 5.469136
## 36    2001    OAK 5.456790
## 37    2009    LAA 5.450617
## 38    2000    KCA 5.425926
## 39    2016    BOS 5.419753
## 40    2020    SDN 5.416667
## 41    2000    NYA 5.409938
## 42    2003    SLN 5.407407
## 43    2011    BOS 5.401235
## 44    2019    WAS 5.388889
## 45    2009    BOS 5.382716
## 46    2003    NYA 5.380368
## 47    2006    CLE 5.370370
## 48    2006    CHA 5.358025
## 49    2007    BOS 5.351852
## 50    2011    NYA 5.351852
## 51    2004    CHA 5.339506
## 52    2005    TEX 5.339506
## 53    2006    PHI 5.339506
## 54    2000    ANA 5.333333
## 55    2021    HOU 5.327160
## 56    2000    TOR 5.314815
## 57    2008    CHN 5.310559
## 58    2004    TEX 5.308642
## 59    2002    BOS 5.302469
## 60    2010    NYA 5.302469
## 61    2004    CLE 5.296296
## 62    2017    NYA 5.296296
## 63    2021    TBA 5.290123
## 64    2002    CHA 5.283951
## 65    2004    SLN 5.277778
## 66    2011    TEX 5.277778
## 67    2019    ATL 5.277778
## 68    2007    COL 5.276074
## 69    2003    COL 5.265432
## 70    2013    BOS 5.265432
## 71    2002    ANA 5.253086
## 72    2020    NYA 5.250000
## 73    2004    SFN 5.246914
## 74    2006    ATL 5.240741
## 75    2000    TEX 5.234568
## 76    2001    HOU 5.228395
## 77    2022    LAN 5.228395
## 78    2021    TOR 5.222222
## 79    2008    BOS 5.216049
## 80    2016    COL 5.216049
## 81    2019    OAK 5.216049
## 82    2002    TEX 5.203704
## 83    2004    BAL 5.197531
## 84    2004    PHI 5.185185
## 85    2003    KCA 5.160494
## 86    2004    ANA 5.160494
## 87    2006    TEX 5.154321
## 88    2019    COL 5.154321
## 89    2006    NYN 5.148148
## 90    2004    COL 5.141975
## 91    2021    LAN 5.123457
## 92    2021    BOS 5.117284
## 93    2004    DET 5.104938
## 94    2020    CHA 5.100000
## 95    2020    PHI 5.100000
## 96    2003    TEX 5.098765
## 97    2017    COL 5.086420
## 98    2008    MIN 5.085890
## 99    2000    DET 5.080247
## 100   2006    DET 5.074074
## 101   2007    LAA 5.074074
## 102   2017    CHN 5.074074
## 103   2008    DET 5.067901
## 104   2006    BOS 5.061728
## 105   2006    LAN 5.061728
## 106   2009    PHI 5.061728
## 107   2000    CIN 5.061350
## 108   2002    ARI 5.055556
## 109   2017    WAS 5.055556
## 110   2001    ARI 5.049383
## 111   2010    BOS 5.049383
## 112   2017    CLE 5.049383
## 113   2007    TEX 5.037037
## 114   2020    TOR 5.033333
## 115   2017    MIN 5.030864
## 116   2005    CIN 5.030675
## 117   2001    SLN 5.024691
## 118   2002    SEA 5.024691
## 119   2019    CHN 5.024691
## 120   2002    TOR 5.018519
## 121   2006    COL 5.018519
## 122   2019    ARI 5.018519
## 123   2017    ARI 5.012346
## 124   2009    MIN 5.012270
## 125   2007    CLE 5.006173
## 126   2000    ATL 5.000000
## 127   2007    ATL 5.000000
## 128   2019    TEX 5.000000
## 129   2006    TOR 4.993827
## 130   2001    NYA 4.993789
## 131   2012    TEX 4.987654
## 132   2016    CHN 4.987654
## 133   2020    SFN 4.983333
## 134   2000    NYN 4.981481
## 135   2005    PHI 4.981481
## 136   2022    NYA 4.981481
## 137   2008    CHA 4.975460
## 138   2003    HOU 4.969136
## 139   2005    SLN 4.969136
## 140   2008    CLE 4.969136
## 141   2007    NYN 4.962963
## 142   2009    COL 4.962963
## 143   2012    NYA 4.962963
## 144   2021    SFN 4.962963
## 145   2004    ATL 4.956790
## 146   2004    HOU 4.956790
## 147   2009    TBA 4.956790
## 148   2010    TBA 4.950617
## 149   2003    MIN 4.944444
## 150   2006    MIN 4.944444
## 151   2007    MIL 4.944444
## 152   2002    OAK 4.938272
## 153   2001    SFN 4.932099
## 154   2008    NYN 4.932099
## 155   2008    PHI 4.932099
## 156   2017    TEX 4.932099
## 157   2000    LAN 4.925926
## 158   2001    CHA 4.925926
## 159   2009    TOR 4.925926
## 160   2013    DET 4.913580
## 161   2021    CHA 4.913580
## 162   2003    SEA 4.907407
## 163   2021    ATL 4.906832
## 164   2000    BAL 4.901235
## 165   2007    SEA 4.901235
## 166   2020    LAA 4.900000
## 167   2000    PIT 4.895062
## 168   2004    OAK 4.895062
## 169   2000    BOS 4.888889
## 170   2000    ARI 4.888889
## 171   2020    WAS 4.883333
## 172   2003    CHA 4.882716
## 173   2003    PHI 4.882716
## 174   2019    NYN 4.882716
## 175   2005    CLE 4.876543
## 176   2007    FLO 4.876543
## 177   2010    CIN 4.876543
## 178   2001    SDN 4.870370
## 179   2004    CHN 4.870370
## 180   2008    NYA 4.870370
## 181   2022    ATL 4.870370
## 182   2020    BOS 4.866667
## 183   2002    SLN 4.858025
## 184   2010    TEX 4.858025
## 185   2011    DET 4.858025
## 186   2008    BAL 4.857143
## 187   2021    CIN 4.851852
## 188   2006    SLN 4.850932
## 189   2009    MIL 4.845679
## 190   2017    BOS 4.845679
## 191   2009    TEX 4.839506
## 192   2002    SFN 4.833333
## 193   2007    CIN 4.833333
## 194   2013    SLN 4.833333
## 195   2007    TBA 4.827160
## 196   2016    CLE 4.826087
## 197   2010    MIN 4.820988
## 198   2020    TBA 4.816667
## 199   2004    MIN 4.814815
## 200   2009    LAN 4.814815
## 201   2008    SLN 4.808642
## 202   2016    SLN 4.808642
## 203   2002    COL 4.802469
## 204   2017    MIA 4.802469
## 205   2001    CHN 4.796296
## 206   2001    BOS 4.795031
## 207   2012    MIL 4.790123
## 208   2005    TOR 4.783951
## 209   2022    TOR 4.783951
## 210   2008    FLO 4.782609
## 211   2008    TBA 4.777778
## 212   2019    PHI 4.777778
## 213   2006    ARI 4.771605
## 214   2009    CLE 4.771605
## 215   2014    LAA 4.771605
## 216   2002    MIN 4.770186
## 217   2020    NYN 4.766667
## 218   2005    OAK 4.765432
## 219   2009    FLO 4.765432
## 220   2010    PHI 4.765432
## 221   2022    NYN 4.765432
## 222   2022    SLN 4.765432
## 223   2001    MIN 4.759259
## 224   2006    OAK 4.759259
## 225   2010    COL 4.753086
## 226   2017    LAN 4.753086
## 227   2005    ATL 4.746914
## 228   2019    CLE 4.746914
## 229   2019    LAA 4.746914
## 230   2019    TBA 4.746914
## 231   2019    MIL 4.746914
## 232   2003    OAK 4.740741
## 233   2004    SDN 4.740741
## 234   2006    BAL 4.740741
## 235   2016    SEA 4.740741
## 236   2001    TOR 4.734568
## 237   2012    LAA 4.734568
## 238   2013    OAK 4.734568
## 239   2006    LAA 4.728395
## 240   2008    LAA 4.722222
## 241   2012    SLN 4.722222
## 242   2016    TEX 4.722222
## 243   2000    CHN 4.716049
## 244   2015    NYA 4.716049
## 245   2019    SLN 4.716049
## 246   2016    WAS 4.709877
## 247   2011    SLN 4.703704
## 248   2004    LAN 4.697531
## 249   2005    LAA 4.697531
## 250   2017    SLN 4.697531
## 251   2003    SFN 4.689441
## 252   2009    OAK 4.685185
## 253   2016    TOR 4.685185
## 254   2001    LAN 4.679012
## 255   2006    FLO 4.679012
## 256   2012    COL 4.679012
## 257   2019    SEA 4.679012
## 258   2019    PIT 4.679012
## 259   2006    KCA 4.672840
## 260   2014    DET 4.672840
## 261   2006    SEA 4.666667
## 262   2007    BAL 4.666667
## 263   2010    TOR 4.660494
## 264   2014    COL 4.660494
## 265   2016    DET 4.658385
## 266   2020    HOU 4.650000
## 267   2003    PIT 4.648148
## 268   2007    TOR 4.648148
## 269   2008    ATL 4.648148
## 270   2017    CIN 4.648148
## 271   2000    SDN 4.641975
## 272   2007    CHN 4.641975
## 273   2010    CHA 4.641975
## 274   2016    ARI 4.641975
## 275   2003    FLO 4.635802
## 276   2010    DET 4.635802
## 277   2015    TEX 4.635802
## 278   2006    SFN 4.633540
## 279   2004    CIN 4.629630
## 280   2005    TBA 4.629630
## 281   2008    MIL 4.629630
## 282   2010    MIL 4.629630
## 283   2017    SEA 4.629630
## 284   2002    HOU 4.623457
## 285   2006    CIN 4.623457
## 286   2000    MIN 4.617284
## 287   2012    CHA 4.617284
## 288   2015    BOS 4.617284
## 289   2008    COL 4.611111
## 290   2022    PHI 4.611111
## 291   2001    PHI 4.604938
## 292   2006    WAS 4.604938
## 293   2013    BAL 4.598765
## 294   2013    CLE 4.598765
## 295   2016    BAL 4.592593
## 296   2021    COL 4.590062
## 297   2011    TOR 4.586420
## 298   2017    BAL 4.586420
## 299   2021    OAK 4.586420
## 300   2020    COL 4.583333
## 301   2001    FLO 4.580247
## 302   2005    CHA 4.574074
## 303   2007    OAK 4.574074
## 304   2009    BAL 4.574074
## 305   2001    MIL 4.567901
## 306   2005    COL 4.567901
## 307   2020    BAL 4.566667
## 308   2020    OAK 4.566667
## 309   2002    CLE 4.561728
## 310   2017    OAK 4.561728
## 311   2003    BAL 4.558282
## 312   2009    DET 4.558282
## 313   2000    MON 4.555556
## 314   2010    ATL 4.555556
## 315   2021    MIL 4.555556
## 316   2000    TBA 4.552795
## 317   2002    KCA 4.549383
## 318   2015    COL 4.549383
## 319   2022    HOU 4.549383
## 320   2007    SDN 4.546012
## 321   2003    ANA 4.543210
## 322   2010    SLN 4.543210
## 323   2000    FLO 4.540373
## 324   2000    MIL 4.539877
## 325   2001    CIN 4.537037
## 326   2002    MON 4.537037
## 327   2006    HOU 4.537037
## 328   2007    LAN 4.537037
## 329   2008    PIT 4.537037
## 330   2009    ATL 4.537037
## 331   2011    CIN 4.537037
## 332   2011    COL 4.537037
## 333   2017    DET 4.537037
## 334   2017    NYN 4.537037
## 335   2022    BOS 4.537037
## 336   2012    BOS 4.530864
## 337   2012    ARI 4.530864
## 338   2021    PHI 4.530864
## 339   2013    LAA 4.524691
## 340   2017    ATL 4.518519
## 341   2017    MIL 4.518519
## 342   2006    SDN 4.512346
## 343   2011    ARI 4.512346
## 344   2012    WAS 4.512346
## 345   2006    MIL 4.506173
## 346   2009    SLN 4.506173
## 347   2011    KCA 4.506173
## 348   2001    KCA 4.500000
## 349   2001    ATL 4.500000
## 350   2005    BAL 4.500000
## 351   2014    OAK 4.500000
## 352   2015    HOU 4.500000
## 353   2016    PIT 4.500000
## 354   2019    BAL 4.500000
## 355   2021    MIN 4.500000
## 356   2021    SDN 4.500000
## 357   2020    MIN 4.483333
## 358   2020    ARI 4.483333
## 359   2005    MIL 4.481481
## 360   2012    DET 4.481481
## 361   2019    TOR 4.481481
## 362   2013    TEX 4.478528
## 363   2007    SLN 4.475309
## 364   2016    LAN 4.475309
## 365   2022    MIL 4.475309
## 366   2001    DET 4.469136
## 367   2003    CHN 4.469136
## 368   2007    PIT 4.469136
## 369   2009    CHA 4.469136
## 370   2015    KCA 4.469136
## 371   2016    HOU 4.469136
## 372   2021    WAS 4.469136
## 373   2004    TOR 4.465839
## 374   2005    DET 4.462963
## 375   2007    HOU 4.462963
## 376   2014    TOR 4.462963
## 377   2021    LAA 4.462963
## 378   2005    NYN 4.456790
## 379   2016    MIN 4.456790
## 380   2011    MIL 4.450617
## 381   2004    KCA 4.444444
## 382   2008    ARI 4.444444
## 383   2009    ARI 4.444444
## 384   2015    ARI 4.444444
## 385   2010    FLO 4.438272
## 386   2004    TBA 4.434783
## 387   2004    FLO 4.432099
## 388   2007    MIN 4.432099
## 389   2011    NYN 4.432099
## 390   2012    SFN 4.432099
## 391   2014    LAN 4.432099
## 392   2003    ARI 4.425926
## 393   2005    FLO 4.425926
## 394   2016    LAA 4.425926
## 395   2021    CLE 4.425926
## 396   2008    HOU 4.422360
## 397   2006    CHN 4.419753
## 398   2012    TOR 4.419753
## 399   2016    CIN 4.419753
## 400   2022    SFN 4.419753
## 401   2020    CHN 4.416667
## 402   2003    TBA 4.413580
## 403   2014    MIN 4.413580
## 404   2016    SFN 4.413580
## 405   2002    PHI 4.409938
## 406   2003    MIL 4.407407
## 407   2008    TOR 4.407407
## 408   2002    LAN 4.401235
## 409   2010    ARI 4.401235
## 410   2011    PHI 4.401235
## 411   2012    OAK 4.401235
## 412   2015    BAL 4.401235
## 413   2002    ATL 4.397516
## 414   2019    CHA 4.397516
## 415   2007    ARI 4.395062
## 416   2012    BAL 4.395062
## 417   2013    TOR 4.395062
## 418   2009    CHN 4.391304
## 419   2003    MON 4.388889
## 420   2021    NYA 4.388889
## 421   2020    MIA 4.383333
## 422   2009    WAS 4.382716
## 423   2017    LAA 4.382716
## 424   2002    CIN 4.376543
## 425   2000    PHI 4.370370
## 426   2011    BAL 4.370370
## 427   2011    TBA 4.364198
## 428   2022    TEX 4.364198
## 429   2002    CHN 4.358025
## 430   2007    KCA 4.358025
## 431   2013    COL 4.358025
## 432   2017    CHA 4.358025
## 433   2021    SLN 4.358025
## 434   2014    BAL 4.351852
## 435   2021    CHN 4.351852
## 436   2022    SDN 4.351852
## 437   2008    CIN 4.345679
## 438   2011    CLE 4.345679
## 439   2005    CHN 4.339506
## 440   2015    WAS 4.339506
## 441   2017    KCA 4.333333
## 442   2022    ARI 4.333333
## 443   2005    KCA 4.327160
## 444   2012    MIN 4.327160
## 445   2019    CIN 4.327160
## 446   2008    LAN 4.320988
## 447   2012    ATL 4.320988
## 448   2002    FLO 4.314815
## 449   2003    CLE 4.314815
## 450   2005    SEA 4.314815
## 451   2004    SEA 4.308642
## 452   2013    CIN 4.308642
## 453   2022    CLE 4.308642
## 454   2022    COL 4.308642
## 455   2010    SFN 4.302469
## 456   2012    TBA 4.302469
## 457   2015    PIT 4.302469
## 458   2021    DET 4.302469
## 459   2021    SEA 4.302469
## 460   2005    ARI 4.296296
## 461   2015    MIN 4.296296
## 462   2015    SFN 4.296296
## 463   2022    MIN 4.296296
## 464   2013    TBA 4.294479
## 465   2020    DET 4.293103
## 466   2002    NYN 4.285714
## 467   2003    CIN 4.283951
## 468   2015    OAK 4.283951
## 469   2017    TBA 4.283951
## 470   2015    DET 4.279503
## 471   2007    CHA 4.277778
## 472   2017    TOR 4.277778
## 473   2001    ANA 4.265432
## 474   2006    PIT 4.265432
## 475   2008    KCA 4.265432
## 476   2019    KCA 4.265432
## 477   2017    PHI 4.259259
## 478   2022    SEA 4.259259
## 479   2006    TBA 4.253086
## 480   2015    CHN 4.253086
## 481   2005    HOU 4.251534
## 482   2005    MIN 4.246914
## 483   2013    ATL 4.246914
## 484   2001    BAL 4.240741
## 485   2009    KCA 4.234568
## 486   2014    WAS 4.234568
## 487   2016    CHA 4.234568
## 488   2016    SDN 4.234568
## 489   2021    KCA 4.234568
## 490   2022    CHA 4.234568
## 491   2020    SEA 4.233333
## 492   2005    LAN 4.228395
## 493   2010    CHN 4.228395
## 494   2013    ARI 4.228395
## 495   2004    PIT 4.223602
## 496   2004    NYN 4.222222
## 497   2005    SDN 4.222222
## 498   2012    PHI 4.222222
## 499   2007    SFN 4.216049
## 500   2015    NYN 4.216049
## 501   2014    PIT 4.209877
## 502   2019    SDN 4.209877
## 503   2010    LAA 4.203704
## 504   2005    PIT 4.197531
## 505   2016    NYA 4.197531
## 506   2021    ARI 4.191358
## 507   2003    SDN 4.185185
## 508   2019    SFN 4.185185
## 509   2002    TBA 4.180124
## 510   2010    KCA 4.172840
## 511   2012    KCA 4.172840
## 512   2016    KCA 4.166667
## 513   2022    BAL 4.160494
## 514   2015    CLE 4.155280
## 515   2007    WAS 4.154321
## 516   2009    CIN 4.154321
## 517   2001    TBA 4.148148
## 518   2016    TBA 4.148148
## 519   2008    SEA 4.141975
## 520   2009    NYN 4.141975
## 521   2016    MIL 4.141975
## 522   2016    NYN 4.141975
## 523   2020    SLN 4.137931
## 524   2001    MON 4.135802
## 525   2020    CLE 4.133333
## 526   2020    KCA 4.133333
## 527   2012    CIN 4.129630
## 528   2014    CLE 4.129630
## 529   2017    PIT 4.123457
## 530   2002    BAL 4.117284
## 531   2010    LAN 4.117284
## 532   2011    LAA 4.117284
## 533   2012    CLE 4.117284
## 534   2015    LAN 4.117284
## 535   2020    MIL 4.116667
## 536   2022    TBA 4.111111
## 537   2010    SDN 4.104938
## 538   2014    SFN 4.104938
## 539   2010    OAK 4.092593
## 540   2002    SDN 4.086420
## 541   2015    LAA 4.080247
## 542   2014    CHA 4.074074
## 543   2016    MIA 4.068323
## 544   2021    BAL 4.067901
## 545   2001    PIT 4.055556
## 546   2009    SFN 4.055556
## 547   2022    CHN 4.055556
## 548   2020    CIN 4.050000
## 549   2010    NYN 4.049383
## 550   2013    WAS 4.049383
## 551   2015    SEA 4.049383
## 552   2010    WAS 4.043210
## 553   2015    MIL 4.043210
## 554   2011    CHA 4.037037
## 555   2011    CHN 4.037037
## 556   2016    ATL 4.031056
## 557   2016    OAK 4.030864
## 558   2012    PIT 4.018519
## 559   2012    SDN 4.018519
## 560   2014    KCA 4.018519
## 561   2008    OAK 4.012422
## 562   2012    NYN 4.012346
## 563   2013    NYA 4.012346
## 564   2014    MIL 4.012346
## 565   2015    SDN 4.012346
## 566   2005    SFN 4.006173
## 567   2013    LAN 4.006173
## 568   2011    LAN 4.000000
## 569   2013    KCA 4.000000
## 570   2022    CIN 4.000000
## 571   2015    SLN 3.993827
## 572   2010    CLE 3.987654
## 573   2003    NYN 3.987578
## 574   2011    OAK 3.981481
## 575   2014    MIA 3.981481
## 576   2002    PIT 3.981366
## 577   2008    WAS 3.981366
## 578   2015    TBA 3.975309
## 579   2009    HOU 3.969136
## 580   2001    NYN 3.962963
## 581   2011    ATL 3.956790
## 582   2008    SFN 3.950617
## 583   2009    SEA 3.950617
## 584   2013    MIL 3.950617
## 585   2015    CIN 3.950617
## 586   2022    KCA 3.950617
## 587   2009    PIT 3.950311
## 588   2005    WAS 3.944444
## 589   2017    SFN 3.944444
## 590   2009    SDN 3.938272
## 591   2004    MIL 3.937888
## 592   2008    SDN 3.932099
## 593   2012    LAN 3.932099
## 594   2014    TEX 3.932099
## 595   2021    NYN 3.925926
## 596   2004    MON 3.919753
## 597   2013    PIT 3.913580
## 598   2014    BOS 3.913580
## 599   2014    SEA 3.913580
## 600   2014    NYA 3.907407
## 601   2013    SFN 3.882716
## 602   2014    HOU 3.882716
## 603   2014    NYN 3.882716
## 604   2011    WAS 3.875776
## 605   2002    MIL 3.870370
## 606   2015    PHI 3.864198
## 607   2011    FLO 3.858025
## 608   2021    TEX 3.858025
## 609   2013    SEA 3.851852
## 610   2021    MIA 3.845679
## 611   2022    LAA 3.845679
## 612   2015    CHA 3.839506
## 613   2011    MIN 3.820988
## 614   2012    SEA 3.820988
## 615   2013    NYN 3.820988
## 616   2014    PHI 3.820988
## 617   2014    SLN 3.820988
## 618   2013    SDN 3.814815
## 619   2004    ARI 3.796296
## 620   2011    HOU 3.796296
## 621   2014    ARI 3.796296
## 622   2019    MIA 3.796296
## 623   2013    MIN 3.790123
## 624   2014    CHN 3.790123
## 625   2010    BAL 3.783951
## 626   2012    CHN 3.783951
## 627   2015    MIA 3.783951
## 628   2014    TBA 3.777778
## 629   2010    HOU 3.771605
## 630   2011    PIT 3.765432
## 631   2013    HOU 3.765432
## 632   2013    PHI 3.765432
## 633   2016    PHI 3.765432
## 634   2012    MIA 3.759259
## 635   2021    PIT 3.759259
## 636   2020    TEX 3.733333
## 637   2017    SDN 3.728395
## 638   2022    WAS 3.722222
## 639   2013    CHN 3.716049
## 640   2013    CHA 3.691358
## 641   2014    CIN 3.672840
## 642   2011    SDN 3.660494
## 643   2020    PIT 3.650000
## 644   2003    DET 3.648148
## 645   2022    PIT 3.648148
## 646   2010    PIT 3.623457
## 647   2022    MIA 3.617284
## 648   2019    DET 3.614907
## 649   2012    HOU 3.598765
## 650   2002    DET 3.571429
## 651   2003    LAN 3.543210
## 652   2014    ATL 3.537037
## 653   2015    ATL 3.537037
## 654   2011    SFN 3.518519
## 655   2022    OAK 3.506173
## 656   2022    DET 3.438272
## 657   2011    SEA 3.432099
## 658   2014    SDN 3.302469
## 659   2010    SEA 3.166667
## 660   2013    MIA 3.166667

Insights: The Chicago White Sox scored 6 runs per game on average in 2000, which is the highest for any team over the last 22 seasons in MLB.

Question 5: Let's visualize how Runs Per Game has changed over the last 22 seasons.

# Visualise league-wide runs per game for each season, 2000-2022.

# Overall average: total runs divided by total games across all team-seasons.
mean_RPG <- sum(lahman_data$R) / sum(lahman_data$G)

# Per-season runs per game as a line, with the overall average drawn as a red
# reference line. The annotation text is built from `mean_RPG` so the label
# cannot drift from the plotted value, and it is placed just below the line.
# NOTE: `data` holds a ggplot object (name kept for compatibility).
data <- lahman_data |>
  group_by(yearID) |>
  summarise(Runs_Per_Game = sum(R) / sum(G)) |>
  ggplot() +
  geom_line(aes(x = yearID, y = Runs_Per_Game)) +
  geom_hline(yintercept = mean_RPG, color = "red") +
  annotate(
    "text",
    x = 2005,
    y = mean_RPG - 0.1,
    label = sprintf("Average Runs_per_Game - %.1f", mean_RPG),
    color = "red"
  ) +
  ggtitle("Runs per game scored year wise") +
  theme_classic()
data

Insights: The average runs per game scored over the last 22 seasons was 4.6. The highest runs per game was seen in the early 2000s, and it has tended to decrease over time.

Question 6: Let's find the factors that are highly correlated with wins, using scatter plots and correlation tests.

# Scatter of wins against runs scored for every team-season. The translucent
# rectangle highlights the cluster of points in the bottom-left corner.
ggplot(data = lahman_data, mapping = aes(x = R, y = W)) +
  geom_point() +
  annotate(
    geom = "rect",
    xmin = 200, xmax = 360,
    ymin = 10, ymax = 45,
    alpha = 0.2
  ) +
  theme_classic()

Note: The points in the bottom-left corner come from the COVID-affected 2020 season, which was shortened. We will therefore remove 2020 before examining the correlation.

# Wins against runs scored with the 2020 season dropped, overlaid with a
# straight-line (lm) fit.
ggplot(
  data = dplyr::filter(lahman_data, yearID != 2020),
  mapping = aes(x = R, y = W)
) +
  geom_point(colour = "red") +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black") +
  ggtitle("Correlation between Wins and Runs Scored") +
  theme_classic()

Insights: There is a positive correlation between runs scored and wins in Major League Baseball. Let's now find the value of r.

# Pearson correlation test between wins (W) and runs scored (R).
# The 2020 season is excluded here as well, so the test uses the same rows as
# the filtered scatter plot: 2020's low win and run totals both come from a
# shortened schedule and can inflate r.
# NOTE(review): output and figures quoted from an earlier run on all seasons
# must be regenerated after this change.
with(
  dplyr::filter(lahman_data, yearID != 2020),
  cor.test(W, R)
)
## 
##  Pearson's product-moment correlation
## 
## data:  lahman_data$W and lahman_data$R
## t = 31.96, df = 658, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7480862 0.8080990
## sample estimates:
##       cor 
## 0.7798785

Insights: The value of r is 0.78, so there is a strong positive relationship between runs scored and wins in Major League Baseball. Along similar lines, let's find the correlation between runs allowed and wins.

# Wins against runs allowed (RA) with the 2020 season dropped, overlaid with a
# straight-line (lm) fit.
ggplot(
  data = dplyr::filter(lahman_data, yearID != 2020),
  mapping = aes(x = RA, y = W)
) +
  geom_point(colour = "red") +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black") +
  ggtitle("Correlation between Wins and Runs Allowed") +
  theme_classic()

Insights: There is a negative correlation between runs allowed and wins in Major League Baseball.

Question 7: Let's visualize the home runs hit by teams in the different divisions (E, W, C), split by league (AL, NL).

# Box plots of team home-run totals (HR) per division, coloured by league.
# divID and lgID are converted to factors so they are treated as categories.
lahman_data |>
  mutate(
    Division = as.factor(divID),
    League = as.factor(lgID)
  ) |>
  ggplot(mapping = aes(x = Division, y = HR, colour = League)) +
  geom_boxplot() +
  scale_colour_brewer(palette = "Dark2") +
  theme_minimal()

Insights: Home-run totals are quite similar across all divisions in both leagues, with most team-seasons falling between 100 and 250 home runs. A box plot is a good choice for visualizing a continuous variable against categorical variables.