## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: ggplot2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'vcd' was built under R version 4.5.2
## Loading required package: grid
mlb.data <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQ17dehFvWJR0GTao13g-XTVBxlpKXo5SqVAZ1ospu_iSoNRqbJNoFg52V5X5ig3WQfNySVOyt_AQlR/pub?output=csv")
head(mlb.data)
##       Player_Name Hits Walks Strikeouts Extra_Base_Hits
## 1      Mike Trout  106    87        178              41
## 2    Mookie Betts  152    61         68              45
## 3       Juan Soto  152   127        137              64
## 4 Freddie Freeman  164    60        128              65
## 5    José Ramírez  168    66         74              67
## 6     Trea Turner  179    43        107              53

Hits

stem.leaf(mlb.data$Hits)
## 1 | 2: represents 12
##  leaf unit: 1
##             n: 50
## LO: 45
##    2     6 | 9
##    5     7 | 289
##    6     8 | 5
##    8     9 | 58
##   12    10 | 3678
##   15    11 | 268
##   21    12 | 044599
##   (6)   13 | 012346
##   23    14 | 1466
##   19    15 | 12223456
##   11    16 | 489
##    8    17 | 002499
##    2    18 | 14
fivenum(mlb.data$Hits)
## [1]  45.0 112.0 133.5 155.0 184.0

\[F_L=112, F_U=155; dF=43\\STEP=1.5(43)=64.5\] \[fence_{lower}=112-64.5=47.5; fence_{upper}=155+64.5=219.5\] One mild outlier at 45 hits. No extreme outliers.

bins <- seq(45, 185, 14)
bin.mids <- (bins[-1]+bins[-length(bins)])/2
ggplot(mlb.data, aes(Hits)) +
  geom_histogram(breaks = bins,
                 fill="white",
                 color="red")

The stem-and-leaf plot and histogram both indicate a negative skew in the number of hits for our data set. This skew may be due to some of the players in the sample not playing the entire season.

hist(mlb.data$Hits, main="Raw Data")

(letter.values <- lval(mlb.data$Hits))
##   depth    lo    hi   mids spreads
## M  25.5 133.5 133.5 133.50     0.0
## H  13.0 112.0 155.0 133.50    43.0
## E   7.0  95.0 170.0 132.50    75.0
## D   4.0  78.0 179.0 128.50   101.0
## C   2.5  70.5 180.0 125.25   109.5
## B   1.0  45.0 184.0 114.50   139.0
select(letter.values, mids)
##     mids
## M 133.50
## H 133.50
## E 132.50
## D 128.50
## C 125.25
## B 114.50
letter.values %>% mutate(LV=1:6) %>%
  ggplot(aes(LV, mids)) +
  geom_point() + ggtitle("Raw Data")

hist((mlb.data$Hits)^1.5, main="P=1.5")

(pow1letter.values <- lval((mlb.data$Hits)^1.5))
##   depth        lo       hi     mids   spreads
## M  25.5 1542.4965 1542.496 1542.496    0.0000
## H  13.0 1185.2966 1929.734 1557.516  744.4379
## E   7.0  925.9455 2216.529 1571.237 1290.5834
## D   4.0  688.8773 2394.857 1541.867 1705.9794
## C   2.5  592.0487 2414.981 1503.515 1822.9327
## B   1.0  301.8692 2495.897 1398.883 2194.0283
select(pow1letter.values,mids)
##       mids
## M 1542.496
## H 1557.516
## E 1571.237
## D 1541.867
## C 1503.515
## B 1398.883
pow1letter.values %>% mutate(LV=1:6) %>%
  ggplot(aes(LV, mids)) +
  geom_point() + ggtitle("P=1.5")

The sequence of midsummaries for the raw data is decreasing, indicating left skewness. Due to the left skew, a power transformation of p=1.5 was applied in an attempt to achieve symmetry. The midsummaries after the transformation no longer follow this trend, which indicates the transformed data is more symmetric.

Hits vs Strikeouts Correlation

ggplot(mlb.data,
       aes(Hits,Strikeouts)) +
  geom_point() +
  xlab("Hits") + ylab("Strikeouts")

lm_fit <- lm(Strikeouts~Hits, data=mlb.data)
summary(lm_fit)
## 
## Call:
## lm(formula = Strikeouts ~ Hits, data = mlb.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.326 -25.609  -0.008  28.051  72.447 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  58.5681    21.6316   2.708  0.00936 **
## Hits          0.4515     0.1580   2.858  0.00628 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.42 on 48 degrees of freedom
## Multiple R-squared:  0.1455, Adjusted R-squared:  0.1276 
## F-statistic:  8.17 on 1 and 48 DF,  p-value: 0.006282
ggplot(mlb.data, aes(Hits, Strikeouts)) +
  geom_point() +
  geom_smooth(method="lm", se=FALSE) +
  xlab("Hits") + ylab("Strikeouts")
## `geom_smooth()` using formula = 'y ~ x'

mlb.data$ls_resid <- resid(lm_fit)
plot(mlb.data$Hits, mlb.data$ls_resid, main="Least Squares Residuals", ylab="Residuals", xlab="Hits")
abline(h=0)

#### Strikeouts

stem.leaf(mlb.data$Strikeouts)
## 1 | 2: represents 120
##  leaf unit: 10
##             n: 50
##     1     t | 3
##     2     f | 4
##     9     s | 6666677
##    15    0. | 888899
##    24    1* | 000000111
##   (11)    t | 22222233333
##    15     f | 4445555
##     8     s | 666677
##     2    1. | 9
##     1    2* | 0
fivenum(mlb.data$Strikeouts)
## [1]  33  88 123 149 201

\[F_L=88, F_U=149; dF=61\\STEP=1.5(61)=91.5\] \[fence_{lower}=88-91.5=-3.5;\\fence_{upper}=149+91.5=240.5\] Using these values as fences, there are no outliers in the Strikeout data.

bins2 <- seq(32, 202, 17)
bin.mids2 <- (bins2[-1]+bins[-length(bins)])/2

ggplot(mlb.data, aes(Strikeouts)) +
  geom_histogram(breaks=bins2,
                 fill="white",
                 color="red")

Because the strikeout data seems roughly symmetric, we will attempt to fit a Gaussian curve to the data. We first need the Gaussian parameters: \[m=\frac{88+149}{2}=118.5\\s=\frac{149-88}{1.349}\approx45.2\]

s <- fit.gaussian(mlb.data$Strikeouts, bins2, 118.5, 45.2)
options(digits=3)
(df <- data.frame(Mid=bin.mids2, d=s$counts, Prob=s$probs, e=s$expected, Residual=s$residual))
##      Mid d   Prob    e Residual
## 1   47.0 2 0.0342 1.71  0.10571
## 2   62.5 3 0.0606 3.03 -0.00931
## 3   78.0 6 0.0934 4.67  0.28857
## 4   93.5 5 0.1251 6.25 -0.26446
## 5  109.0 8 0.1456 7.28  0.13027
## 6  124.5 8 0.1474 7.37  0.11358
## 7  140.0 7 0.1298 6.49  0.09853
## 8  155.5 7 0.0993 4.97  0.41715
## 9  171.0 2 0.0661 3.31 -0.40397
## 10 186.5 2 0.0383 1.91  0.03102
p <- ggplot(mlb.data, aes(Strikeouts)) +
  geom_histogram(breaks = bins2)
out <- ggplot_build(p)$data[[1]]
select(out, count, x, xmin, xmax)
##    count     x xmin xmax
## 1      2  40.5   32   49
## 2      3  57.5   49   66
## 3      6  74.5   66   83
## 4      5  91.5   83  100
## 5      8 108.5  100  117
## 6      8 125.5  117  134
## 7      7 142.5  134  151
## 8      7 159.5  151  168
## 9      2 176.5  168  185
## 10     2 193.5  185  202
ggplot(out, aes(x, count)) +
  geom_col() +
  geom_line(data = df, 
            aes(bin.mids2, e), color="red")