library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Absolute path of the input CSV file.
Data_set <- "/Users/ba/Documents/IUPUI/Masters/First Sem/Statistics/Dataset/PitchingPost.csv"

# Read the CSV into a base data.frame.
Pitching_Data <- read.csv(file = Data_set)
#' Count the infinite entries of a vector
#'
#' @param x A vector to inspect.
#' @return An integer: how many elements of `x` are `Inf` or `-Inf`
#'   (`NA` and `NaN` are not counted).
check_infinite <- function(x) {
  infinite_positions <- which(is.infinite(x))
  length(infinite_positions)
}

# Count the infinite values in every column of the data frame. vapply() is
# used rather than sapply() so the result is always a named integer vector
# (one count per column), even for a data frame with no columns.
infinite_values <- vapply(Pitching_Data, check_infinite, integer(1))

# Print the per-column counts; a non-zero entry marks a column that contains
# infinite values.
print(infinite_values)
## playerID   yearID    round   teamID     lgID        W        L        G 
##        0        0        0        0        0        0        0        0 
##       GS       CG      SHO       SV   IPouts        H       ER       HR 
##        0        0        0        0        0        0        0        0 
##       BB       SO    BAOpp      ERA      IBB       WP      HBP       BK 
##        0        0        0       23        0        0        0        0 
##      BFP       GF        R       SH       SF     GIDP 
##        0        0        0        0        0        0
# Keep only the rows where both ERA and BAOpp are finite; is.finite() is
# FALSE for Inf, -Inf, NaN and NA, so all of those rows are dropped.
New_Pitching_Data <- Pitching_Data |>
  filter(is.finite(ERA) & is.finite(BAOpp))
# Select the numeric columns using the column types of the very data frame
# being subset (not those of the unfiltered one), with a type-stable
# vapply(); drop = FALSE keeps a data frame even if one column remains.
numeric_columns <- vapply(New_Pitching_Data, is.numeric, logical(1))
numeric_data <- New_Pitching_Data[, numeric_columns, drop = FALSE]

# Pairwise correlation matrix of the numeric columns, stored as a data frame.
cor_data <- data.frame(cor(numeric_data))
cor_data
##              yearID            W            L             G           GS
## yearID  1.000000000 -0.050212396 -0.052090048 -0.0214620740 -0.070448510
## W      -0.050212396  1.000000000 -0.103221405  0.0555063160  0.340169744
## L      -0.052090048 -0.103221405  1.000000000 -0.0436488094  0.402686945
## G      -0.021462074  0.055506316 -0.043648809  1.0000000000 -0.273029508
## GS     -0.070448510  0.340169744  0.402686945 -0.2730295081  1.000000000
## CG     -0.068282884  0.226924608 -0.012066891 -0.0352883948  0.157565061
## SHO    -0.059470447  0.183891374 -0.027529851 -0.0243084371  0.114566262
## SV     -0.030115228 -0.054424984 -0.066365710  0.2661500977 -0.155344759
## IPouts -0.131116138  0.496732113  0.294328048  0.0724929048  0.815655815
## H      -0.169189493  0.253258386  0.465697665 -0.0055447162  0.760580419
## ER     -0.106480668  0.025007446  0.565215108 -0.0360084768  0.592981936
## HR     -0.021543282  0.065154718  0.364651854 -0.0291934365  0.432200144
## BB     -0.117707221  0.167347978  0.288808310  0.0445155865  0.543928596
## SO      0.020988616  0.429536775  0.223484256  0.1150729098  0.640193777
## BAOpp  -0.067987257 -0.128224439  0.221432413 -0.0685256898  0.079413398
## ERA    -0.003768949 -0.140849340  0.185137607 -0.0881824481  0.002398787
## IBB    -0.113667854 -0.011638732  0.171680934  0.1354769915  0.081247874
## WP     -0.026006264  0.041025595  0.103221277  0.0503233282  0.145192786
## HBP    -0.038969904  0.081939723  0.136661405  0.0022559596  0.246539274
## BK     -0.008883173 -0.001767919 -0.009150962  0.0008872018 -0.002425797
## BFP    -0.149712280  0.444443010  0.371454088  0.0564587684  0.852393787
## GF     -0.049520577 -0.050229816 -0.086852124  0.4141473525 -0.332228073
## R      -0.116389697  0.020183964  0.591240510 -0.0308366481  0.601579716
## SH     -0.175680175  0.017257464  0.184172488  0.0139020944  0.204023814
## SF     -0.061444430  0.046125237  0.113842504  0.0291292893  0.162425051
## GIDP   -0.093936255  0.195022966  0.116536495  0.0328025748  0.342711068
##                  CG          SHO           SV      IPouts            H
## yearID -0.068282884 -0.059470447 -0.030115228 -0.13111614 -0.169189493
## W       0.226924608  0.183891374 -0.054424984  0.49673211  0.253258386
## L      -0.012066891 -0.027529851 -0.066365710  0.29432805  0.465697665
## G      -0.035288395 -0.024308437  0.266150098  0.07249290 -0.005544716
## GS      0.157565061  0.114566262 -0.155344759  0.81565581  0.760580419
## CG      1.000000000  0.750349212 -0.017901010  0.27057058  0.095643517
## SHO     0.750349212  1.000000000 -0.009358419  0.20130135  0.050260794
## SV     -0.017901010 -0.009358419  1.000000000 -0.01634374 -0.083741988
## IPouts  0.270570577  0.201301354 -0.016343738  1.00000000  0.742876718
## H       0.095643517  0.050260794 -0.083741988  0.74287672  1.000000000
## ER     -0.005147906 -0.024993337 -0.108636814  0.45441928  0.770723180
## HR     -0.002797421 -0.012510200 -0.082386099  0.36279913  0.515032664
## BB      0.029987917  0.022977288 -0.044800607  0.51536179  0.458636848
## SO      0.230058876  0.185634233  0.031302327  0.82311451  0.549567701
## BAOpp  -0.056423612 -0.050873696 -0.090839076 -0.08794663  0.419619987
## ERA    -0.041758665 -0.035067663 -0.081184397 -0.17020671  0.143738016
## IBB    -0.028190430 -0.017092496 -0.001174654  0.12344983  0.178322133
## WP      0.004550018 -0.007630957 -0.022267387  0.15009934  0.159986760
## HBP     0.010126819  0.013395273 -0.011949976  0.24216674  0.207868657
## BK     -0.007903506 -0.005953507 -0.011897616 -0.00268376  0.017543010
## BFP     0.227322991  0.165203523 -0.036841439  0.97705997  0.843416148
## GF     -0.045457845 -0.029999843  0.660887651 -0.13445362 -0.192581472
## R      -0.004623240 -0.028330902 -0.110176137  0.46409839  0.782145872
## SH      0.062936960  0.032387351 -0.036839978  0.23540584  0.248634829
## SF     -0.012932080 -0.022326524 -0.033787219  0.15497031  0.206188771
## GIDP    0.056292160  0.030383208 -0.043145389  0.41069372  0.381399069
##                  ER           HR          BB           SO       BAOpp
## yearID -0.106480668 -0.021543282 -0.11770722  0.020988616 -0.06798726
## W       0.025007446  0.065154718  0.16734798  0.429536775 -0.12822444
## L       0.565215108  0.364651854  0.28880831  0.223484256  0.22143241
## G      -0.036008477 -0.029193437  0.04451559  0.115072910 -0.06852569
## GS      0.592981936  0.432200144  0.54392860  0.640193777  0.07941340
## CG     -0.005147906 -0.002797421  0.02998792  0.230058876 -0.05642361
## SHO    -0.024993337 -0.012510200  0.02297729  0.185634233 -0.05087370
## SV     -0.108636814 -0.082386099 -0.04480061  0.031302327 -0.09083908
## IPouts  0.454419284  0.362799135  0.51536179  0.823114512 -0.08794663
## H       0.770723180  0.515032664  0.45863685  0.549567701  0.41961999
## ER      1.000000000  0.632336432  0.46459396  0.335611560  0.42598256
## HR      0.632336432  1.000000000  0.23872430  0.302423953  0.23297887
## BB      0.464593964  0.238724302  1.00000000  0.420804209  0.04773711
## SO      0.335611560  0.302423953  0.42080421  1.000000000 -0.13217437
## BAOpp   0.425982557  0.232978869  0.04773711 -0.132174374  1.00000000
## ERA     0.435857459  0.213986451  0.05907848 -0.154321199  0.59357720
## IBB     0.185134698  0.028845692  0.34252914  0.089974300  0.08656159
## WP      0.173325765  0.052311236  0.23158219  0.155701810  0.04325997
## HBP     0.221205015  0.090493387  0.17527004  0.180870762  0.01480214
## BK      0.035470287  0.019274586  0.01949963 -0.001562896  0.02665303
## BFP     0.582505504  0.422627459  0.60801865  0.794441660  0.04031646
## GF     -0.188893580 -0.125074720 -0.13777013 -0.068398794 -0.10298625
## R       0.972143741  0.621626654  0.47136999  0.340927322  0.42904965
## SH      0.176290454  0.049283900  0.22324705  0.109549501  0.08905299
## SF      0.208275458  0.034135306  0.15458517  0.057736074  0.11906166
## GIDP    0.198674095  0.142789633  0.26586669  0.225342145  0.05301241
##                 ERA          IBB           WP           HBP            BK
## yearID -0.003768949 -0.113667854 -0.026006264 -0.0389699044 -0.0088831727
## W      -0.140849340 -0.011638732  0.041025595  0.0819397234 -0.0017679193
## L       0.185137607  0.171680934  0.103221277  0.1366614046 -0.0091509615
## G      -0.088182448  0.135476992  0.050323328  0.0022559596  0.0008872018
## GS      0.002398787  0.081247874  0.145192786  0.2465392739 -0.0024257967
## CG     -0.041758665 -0.028190430  0.004550018  0.0101268187 -0.0079035062
## SHO    -0.035067663 -0.017092496 -0.007630957  0.0133952734 -0.0059535069
## SV     -0.081184397 -0.001174654 -0.022267387 -0.0119499761 -0.0118976162
## IPouts -0.170206714  0.123449830  0.150099343  0.2421667439 -0.0026837603
## H       0.143738016  0.178322133  0.159986760  0.2078686568  0.0175430098
## ER      0.435857459  0.185134698  0.173325765  0.2212050149  0.0354702868
## HR      0.213986451  0.028845692  0.052311236  0.0904933873  0.0192745863
## BB      0.059078482  0.342529141  0.231582193  0.1752700431  0.0194996311
## SO     -0.154321199  0.089974300  0.155701810  0.1808707618 -0.0015628960
## BAOpp   0.593577203  0.086561589  0.043259967  0.0148021355  0.0266530290
## ERA     1.000000000  0.044750603  0.024946332  0.0461792372  0.0658163871
## IBB     0.044750603  1.000000000  0.078820126  0.0398933636 -0.0039107172
## WP      0.024946332  0.078820126  1.000000000  0.0502298870  0.0290074575
## HBP     0.046179237  0.039893364  0.050229887  1.0000000000 -0.0009097134
## BK      0.065816387 -0.003910717  0.029007457 -0.0009097134  1.0000000000
## BFP    -0.076911219  0.174947603  0.180030007  0.2780079769  0.0050726142
## GF     -0.097784223  0.022563952 -0.039025174 -0.0696157275 -0.0177007632
## R       0.419678906  0.192026375  0.171484247  0.2228088059  0.0351583866
## SH     -0.013619455  0.176641724  0.041871301  0.1184557970 -0.0187674470
## SF      0.048695501  0.063890644  0.054113405  0.0713645780  0.0074273980
## GIDP   -0.082751824  0.071956422  0.020434986  0.1480078495 -0.0178934140
##                 BFP          GF           R          SH           SF
## yearID -0.149712280 -0.04952058 -0.11638970 -0.17568018 -0.061444430
## W       0.444443010 -0.05022982  0.02018396  0.01725746  0.046125237
## L       0.371454088 -0.08685212  0.59124051  0.18417249  0.113842504
## G       0.056458768  0.41414735 -0.03083665  0.01390209  0.029129289
## GS      0.852393787 -0.33222807  0.60157972  0.20402381  0.162425051
## CG      0.227322991 -0.04545784 -0.00462324  0.06293696 -0.012932080
## SHO     0.165203523 -0.02999984 -0.02833090  0.03238735 -0.022326524
## SV     -0.036841439  0.66088765 -0.11017614 -0.03683998 -0.033787219
## IPouts  0.977059974 -0.13445362  0.46409839  0.23540584  0.154970308
## H       0.843416148 -0.19258147  0.78214587  0.24863483  0.206188771
## ER      0.582505504 -0.18889358  0.97214374  0.17629045  0.208275458
## HR      0.422627459 -0.12507472  0.62162665  0.04928390  0.034135306
## BB      0.608018651 -0.13777013  0.47136999  0.22324705  0.154585165
## SO      0.794441660 -0.06839879  0.34092732  0.10954950  0.057736074
## BAOpp   0.040316456 -0.10298625  0.42904965  0.08905299  0.119061657
## ERA    -0.076911219 -0.09778422  0.41967891 -0.01361946  0.048695501
## IBB     0.174947603  0.02256395  0.19202637  0.17664172  0.063890644
## WP      0.180030007 -0.03902517  0.17148425  0.04187130  0.054113405
## HBP     0.278007977 -0.06961573  0.22280881  0.11845580  0.071364578
## BK      0.005072614 -0.01770076  0.03515839 -0.01876745  0.007427398
## BFP     1.000000000 -0.16325874  0.59752187  0.26390709  0.186492568
## GF     -0.163258738  1.00000000 -0.18920449 -0.04847096 -0.036179065
## R       0.597521869 -0.18920449  1.00000000  0.20707421  0.217640284
## SH      0.263907092 -0.04847096  0.20707421  1.00000000  0.107603400
## SF      0.186492568 -0.03617906  0.21764028  0.10760340  1.000000000
## GIDP    0.391816301 -0.07213071  0.20018894  0.10938006  0.070921102
##               GIDP
## yearID -0.09393625
## W       0.19502297
## L       0.11653649
## G       0.03280257
## GS      0.34271107
## CG      0.05629216
## SHO     0.03038321
## SV     -0.04314539
## IPouts  0.41069372
## H       0.38139907
## ER      0.19867409
## HR      0.14278963
## BB      0.26586669
## SO      0.22534214
## BAOpp   0.05301241
## ERA    -0.08275182
## IBB     0.07195642
## WP      0.02043499
## HBP     0.14800785
## BK     -0.01789341
## BFP     0.39181630
## GF     -0.07213071
## R       0.20018894
## SH      0.10938006
## SF      0.07092110
## GIDP    1.00000000
# Scatter plot of BAOpp against ERA with a smoothed trend line and its
# confidence band.
ggplot(New_Pitching_Data, aes(x = ERA, y = BAOpp)) +
  geom_point(color = "blue") +
  stat_smooth(se = TRUE, color = "black") +
  labs(
    x = "Earned Runs Average",
    y = "Opponents Batting Average",
    title = "ERA vs BAOpp"
  ) +
  theme_classic()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Report the ERA/BAOpp correlation, rounded to two decimals.
paste(
  "Correlation between ERA and BAOpp:",
  round(with(New_Pitching_Data, cor(ERA, BAOpp)), 2)
)
## [1] "Correlation between ERA and BAOpp: 0.59"

Interpretation: The scatter plot and the correlation of 0.59 indicate a moderate positive association between opponents’ batting average (BAOpp) and earned run average (ERA): pitchers whose opponents bat for a higher average also tend to have a higher ERA. Because each record in the data describes an individual pitcher rather than a team, this relationship implies that pitchers who face opponents hitting them more successfully tend to yield more earned runs on average. Such an outcome suggests a potential reduction in the effectiveness of pitching performance for these pitchers. Overall, the result points to a link between vulnerability to opposing hitters, as gauged by opponents’ batting average, and the overall efficacy of the pitching; it is a correlation and does not by itself establish causation.

# Scatter plot of strikeouts (SO) against outs pitched (IPouts) with a
# smoothed trend line and its confidence band.
# The x-axis label is corrected: IPouts counts outs, i.e. innings pitched
# multiplied by 3, not innings divided by 3.
New_Pitching_Data |>
  ggplot(aes(x = IPouts, y = SO)) +
  geom_point(color = "green") +
  stat_smooth(se = TRUE, color = "black") +
  labs(
    x = "Outs Pitched (Innings Pitched x 3)",
    y = "No.of Strike Outs",
    title = "IPOuts vs SO"
  ) +
  theme_classic()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Correlation between outs pitched and strikeouts.
with(New_Pitching_Data, cor(IPouts, SO))
## [1] 0.8231145

Interpretation: The correlation of 0.82 indicates a strong positive relationship between the duration of a pitcher’s participation in a game, measured by outs pitched (IPouts, i.e. innings pitched × 3), and the number of strikeouts the pitcher accumulates. This finding implies that as pitchers stay on the field longer, they face more batters and therefore have more opportunities to register strikeouts. The correlation underscores the interplay between time spent pitching and performance outcomes, and shows the importance of sustained game presence for accumulating favorable pitching statistics, particularly strikeouts.

Confidence Interval:

# Fix the random-number seed so the resampling that follows is reproducible.
set.seed(123)
#' Bootstrap the sampling distribution of a statistic
#'
#' Draws `n` resamples of `Data` (each the same length as `Data`, drawn with
#' replacement) and evaluates `func` on every resample.
#'
#' @param Data A vector of observations to resample.
#' @param func A function (or the name of one) applied to each resample.
#' @param n Number of bootstrap resamples; `0` gives an empty result.
#' @return The values returned by `func` for all resamples, combined into a
#'   single vector (`NULL` when `n` is 0).
Bootstrap <- function(Data, func = mean, n = 10^3) {
  func <- match.fun(func)
  n_obs <- length(Data)

  # Preallocate the results instead of growing a vector with c() on every
  # iteration.
  func_values <- vector("list", n)

  # seq_len() iterates zero times for n = 0, whereas 1:n would count 1, 0.
  for (i in seq_len(n)) {
    # Resample by index instead of calling sample(Data, ...): sample() treats
    # a single number k >= 1 as "sample from 1:k", which would resample the
    # wrong values for a length-one Data. For longer vectors this draws the
    # same random indices as sample() does.
    x_sample <- Data[sample.int(n_obs, size = n_obs, replace = TRUE)]

    # Wrap in list() so a NULL result does not delete the list slot.
    func_values[i] <- list(func(x_sample))
  }

  unlist(func_values)
}
# Bootstrap distribution of the mean of ER from 10^3 resamples. The statistic
# is passed under its full argument name `func` instead of relying on partial
# argument matching.
Bootstrap_mean <- Bootstrap(New_Pitching_Data$ER, func = mean, n = 10^3)
head(Bootstrap_mean)
## [1] 1.651270 1.691248 1.632631 1.670989 1.677742 1.728795
# Histogram of the bootstrapped means.
ggplot(data.frame(Bootstrap_mean = Bootstrap_mean), aes(x = Bootstrap_mean)) +
  geom_histogram(colour = "white") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Confidence Interval using Standard Error of Bootstrapped means.

library(boot)
# Point estimate: the observed mean of ER.
Avg_ER <- mean(New_Pitching_Data[["ER"]])

# Confidence level and number of observations.
P <- .95
n <- length(New_Pitching_Data[["ER"]])

# Standard error taken as the standard deviation of the bootstrapped means.
Bootstrap_SE <- sd(Bootstrap_mean)

# Upper-tail t critical value with n - 1 degrees of freedom.
t_star <- qt(p = (1 - P) / 2, df = n - 1, lower.tail = FALSE)

# Half-width of the interval, then its rounded lower and upper bounds.
CI_t <- t_star * Bootstrap_SE
CI <- round(Avg_ER + c(-1, 1) * CI_t, 2)

paste("The true mean is contained 95% of times in:", CI[1], "to", CI[2])
## [1] "The true mean is contained 95% of times in: 1.61 to 1.73"

Interpretation: A 95% confidence interval is constructed to estimate the true population mean of earned runs (ER) from the available sample data, using the standard error of the bootstrapped means. The resulting interval, from 1.61 to 1.73, serves as a plausible range for the true mean: intervals built with this procedure would contain the true mean in about 95% of repeated samples. The interval expresses the uncertainty inherent in estimating a population parameter from finite sample data, and its width indicates the precision of the estimated mean.

# Average strikeouts per pitcher, then label each pitcher relative to the
# mean of those per-pitcher averages. cut() intervals are closed on the
# right, so a value equal to the mean falls in "Below Average".
SO_DF <- New_Pitching_Data |>
  group_by(playerID) |>
  summarise(SO = mean(SO)) |>
  mutate(
    Performance = cut(
      SO,
      breaks = c(-0.1, mean(SO), Inf),
      labels = c("Below Average", "Above Average")
    )
  )
SO_DF
## # A tibble: 1,005 × 3
##    playerID     SO Performance  
##    <chr>     <dbl> <fct>        
##  1 abadfe01  0     Below Average
##  2 abbotpa01 2.25  Below Average
##  3 abreubr01 4.75  Above Average
##  4 aceveal01 0.667 Below Average
##  5 adamja01  2     Below Average
##  6 adamsau02 0.5   Below Average
##  7 adamsmi03 2     Below Average
##  8 affelje01 1.82  Below Average
##  9 alberma01 0     Below Average
## 10 albural01 2.33  Below Average
## # ℹ 995 more rows
# Mean of the per-pitcher average strikeouts (the cut point used above).
mean(SO_DF[["SO"]])
## [1] 2.879028
# Number of pitchers in each performance category.
count(SO_DF, Performance)
## # A tibble: 2 × 2
##   Performance       n
##   <fct>         <int>
## 1 Below Average   572
## 2 Above Average   433
# Bar chart of the number of pitchers per performance category.
ggplot(
  count(SO_DF, Performance),
  aes(x = Performance, y = n, fill = Performance)
) +
  geom_bar(stat = "identity") +
  labs(title = "Distribution of pitchers based on Strike Out performance") +
  theme_classic()

Interpretation: The mean strikeout value, computed by first averaging each pitcher’s strikeouts and then averaging those per-pitcher values, is 2.879. Splitting the pitchers at this mean shows a dichotomy in strikeout performance. Specifically, 433 pitchers have strikeout values exceeding the mean, indicating an above-average proficiency in this aspect of pitching. Conversely, 572 pitchers have strikeout values at or below the mean, suggesting a comparatively lower proficiency in achieving strikeouts.