## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'vcd' was built under R version 4.5.2
## Loading required package: grid
mlb.data <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQ17dehFvWJR0GTao13g-XTVBxlpKXo5SqVAZ1ospu_iSoNRqbJNoFg52V5X5ig3WQfNySVOyt_AQlR/pub?output=csv")
head(mlb.data)
## Player_Name Hits Walks Strikeouts Extra_Base_Hits
## 1 Mike Trout 106 87 178 41
## 2 Mookie Betts 152 61 68 45
## 3 Juan Soto 152 127 137 64
## 4 Freddie Freeman 164 60 128 65
## 5 José RamÃrez 168 66 74 67
## 6 Trea Turner 179 43 107 53
stem.leaf(mlb.data$Hits)
## 1 | 2: represents 12
## leaf unit: 1
## n: 50
## LO: 45
## 2 6 | 9
## 5 7 | 289
## 6 8 | 5
## 8 9 | 58
## 12 10 | 3678
## 15 11 | 268
## 21 12 | 044599
## (6) 13 | 012346
## 23 14 | 1466
## 19 15 | 12223456
## 11 16 | 489
## 8 17 | 002499
## 2 18 | 14
fivenum(mlb.data$Hits)
## [1] 45.0 112.0 133.5 155.0 184.0
\[F_L=112, F_U=155; dF=43\\STEP=1.5(43)=64.5\] \[fence_{lower}=112-64.5=47.5; fence_{upper}=155+64.5=219.5\] One mild outlier at 45 hits. No extreme outliers.
bins <- seq(45, 185, 14)
bin.mids <- (bins[-1]+bins[-length(bins)])/2
ggplot(mlb.data, aes(Hits)) +
geom_histogram(breaks = bins,
fill="white",
color="red")
The stem-and-leaf plot and histogram both indicate a negative skew in
the number of hits for our data set. This skew may be due to some of the
players in the sample not playing the entire season.
hist(mlb.data$Hits, main="Raw Data")
(letter.values <- lval(mlb.data$Hits))
## depth lo hi mids spreads
## M 25.5 133.5 133.5 133.50 0.0
## H 13.0 112.0 155.0 133.50 43.0
## E 7.0 95.0 170.0 132.50 75.0
## D 4.0 78.0 179.0 128.50 101.0
## C 2.5 70.5 180.0 125.25 109.5
## B 1.0 45.0 184.0 114.50 139.0
select(letter.values, mids)
## mids
## M 133.50
## H 133.50
## E 132.50
## D 128.50
## C 125.25
## B 114.50
letter.values %>% mutate(LV=1:6) %>%
ggplot(aes(LV, mids)) +
geom_point() + ggtitle("Raw Data")
hist((mlb.data$Hits)^1.5, main="P=1.5")
(pow1letter.values <- lval((mlb.data$Hits)^1.5))
## depth lo hi mids spreads
## M 25.5 1542.4965 1542.496 1542.496 0.0000
## H 13.0 1185.2966 1929.734 1557.516 744.4379
## E 7.0 925.9455 2216.529 1571.237 1290.5834
## D 4.0 688.8773 2394.857 1541.867 1705.9794
## C 2.5 592.0487 2414.981 1503.515 1822.9327
## B 1.0 301.8692 2495.897 1398.883 2194.0283
select(pow1letter.values,mids)
## mids
## M 1542.496
## H 1557.516
## E 1571.237
## D 1541.867
## C 1503.515
## B 1398.883
pow1letter.values %>% mutate(LV=1:6) %>%
ggplot(aes(LV, mids)) +
geom_point() + ggtitle("P=1.5")
The sequence of midsummaries for the raw data is decreasing, indicating
left skewness. Due to the left skew, a power transformation of p=1.5 was
applied in an attempt to achieve symmetry. The midsummaries after the
transformation no longer follow this trend, which indicates the
transformed data is more symmetric.
ggplot(mlb.data,
aes(Hits,Strikeouts)) +
geom_point() +
xlab("Hits") + ylab("Strikeouts")
lm_fit <- lm(Strikeouts~Hits, data=mlb.data)
summary(lm_fit)
##
## Call:
## lm(formula = Strikeouts ~ Hits, data = mlb.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.326 -25.609 -0.008 28.051 72.447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.5681 21.6316 2.708 0.00936 **
## Hits 0.4515 0.1580 2.858 0.00628 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36.42 on 48 degrees of freedom
## Multiple R-squared: 0.1455, Adjusted R-squared: 0.1276
## F-statistic: 8.17 on 1 and 48 DF, p-value: 0.006282
ggplot(mlb.data, aes(Hits, Strikeouts)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
xlab("Hits") + ylab("Strikeouts")
## `geom_smooth()` using formula = 'y ~ x'
mlb.data$ls_resid <- resid(lm_fit)
plot(mlb.data$Hits, mlb.data$ls_resid, main="Least Squares Residuals", ylab="Residuals", xlab="Hits")
abline(h=0)
#### Strikeouts
stem.leaf(mlb.data$Strikeouts)
## 1 | 2: represents 120
## leaf unit: 10
## n: 50
## 1 t | 3
## 2 f | 4
## 9 s | 6666677
## 15 0. | 888899
## 24 1* | 000000111
## (11) t | 22222233333
## 15 f | 4445555
## 8 s | 666677
## 2 1. | 9
## 1 2* | 0
fivenum(mlb.data$Strikeouts)
## [1] 33 88 123 149 201
\[F_L=88, F_U=149; dF=61\\STEP=1.5(61)=91.5\] \[fence_{lower}=88-91.5=-3.5;\\fence_{upper}=149+91.5=240.5\] Using these values as fences, there are no outliers in the Strikeout data.
bins2 <- seq(32, 202, 17)
bin.mids2 <- (bins2[-1]+bins[-length(bins)])/2
ggplot(mlb.data, aes(Strikeouts)) +
geom_histogram(breaks=bins2,
fill="white",
color="red")
Because the strikeout data seems roughly symmetric, we will attempt to
fit a Gaussian curve to the data. We first need the Gaussian parameters:
\[m=\frac{88+149}{2}=118.5\\s=\frac{149-88}{1.349}\approx45.2\]
s <- fit.gaussian(mlb.data$Strikeouts, bins2, 118.5, 45.2)
options(digits=3)
(df <- data.frame(Mid=bin.mids2, d=s$counts, Prob=s$probs, e=s$expected, Residual=s$residual))
## Mid d Prob e Residual
## 1 47.0 2 0.0342 1.71 0.10571
## 2 62.5 3 0.0606 3.03 -0.00931
## 3 78.0 6 0.0934 4.67 0.28857
## 4 93.5 5 0.1251 6.25 -0.26446
## 5 109.0 8 0.1456 7.28 0.13027
## 6 124.5 8 0.1474 7.37 0.11358
## 7 140.0 7 0.1298 6.49 0.09853
## 8 155.5 7 0.0993 4.97 0.41715
## 9 171.0 2 0.0661 3.31 -0.40397
## 10 186.5 2 0.0383 1.91 0.03102
p <- ggplot(mlb.data, aes(Strikeouts)) +
geom_histogram(breaks = bins2)
out <- ggplot_build(p)$data[[1]]
select(out, count, x, xmin, xmax)
## count x xmin xmax
## 1 2 40.5 32 49
## 2 3 57.5 49 66
## 3 6 74.5 66 83
## 4 5 91.5 83 100
## 5 8 108.5 100 117
## 6 8 125.5 117 134
## 7 7 142.5 134 151
## 8 7 159.5 151 168
## 9 2 176.5 168 185
## 10 2 193.5 185 202
ggplot(out, aes(x, count)) +
geom_col() +
geom_line(data = df,
aes(bin.mids2, e), color="red")