library(LearnEDAfunctions)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
# Load the MLB batting data (one row per player) from the Excel workbook.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs on a computer where this exact file exists. Consider moving the workbook
# next to the script and using a project-relative path.
DataMLB <- read_excel("C:/Users/KayVog22/Downloads/Final Project Data (1).xlsx")

# View() opens an interactive data viewer, so only call it when a person is
# at the console; it is skipped when the document is knitted or run in batch.
if (interactive()) {
  View(DataMLB)
}

# Preview the first rows and the column types.
head(DataMLB)
## # A tibble: 6 × 8
## Player_Name Team BA Hits Walks Strikeouts Singles Extra_Base_Hits
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mike Trout LAA 0.303 185 75 130 130 55
## 2 Mookie Betts LAD 0.298 170 60 120 120 50
## 3 Juan Soto SD 0.312 180 85 110 125 55
## 4 Freddie Freeman LAD 0.301 175 70 105 120 55
## 5 José Ramírez CLE 0.287 160 55 115 115 45
## 6 Trea Turner PHI 0.295 172 65 135 130 42
# Stem-and-leaf display of the hit totals, one line per stem (m = 1).
aplpack::stem.leaf(DataMLB$Hits, m = 1)
## 1 | 2: represents 12
## leaf unit: 1
## n: 50
## 7 15 | 0555888
## (34) 16 | 0000000000002222223355555588888888
## 9 17 | 0000255
## 2 18 | 05

# Kernel density estimate of Hits, computed once and reused for both plots.
hits_density <- density(DataMLB$Hits, bw = 1.5)

# Histogram with the density estimate overlaid. The histogram must be drawn on
# the density scale (freq = FALSE): with the default count scale the curve,
# whose heights are densities, would lie flat along the baseline.
# ylim is widened so that neither the bars nor the curve are clipped.
hits_bins <- hist(DataMLB$Hits, plot = FALSE)
hist(
  DataMLB$Hits,
  freq = FALSE,
  ylim = c(0, max(hits_bins$density, hits_density$y))
)
lines(hits_density, lwd = 2)

# The same density estimate on its own, with axes, labels and title removed
# and only a frame drawn around it, to show the bare shape.
plot(
  hits_density,
  lwd = 2,
  axes = FALSE,
  xlab = "",
  ylab = "",
  main = ""
)
box()
# Use a single-panel layout, so each histogram below gets its own figure.
par(mfrow = c(1, 1))

# Histograms of Hits on three scales, to compare the shape under each
# re-expression. A small start of 0.05 is added before the root and the power.
# Raw scale:
hist(x = DataMLB$Hits, main = "Raw+")
# Square-root scale:
hist(x = sqrt(DataMLB$Hits + 0.05), main = "ROOTS")
# Power p = 0.001:
hist(x = (DataMLB$Hits + 0.05)^0.001, main = "p=0.001")
# Letter-value summary of Hits. lval() is not defined in this file; it is
# expected to come from LearnEDAfunctions, which is attached at the top.
# The outer parentheses print the result as well as storing it.
(letter.values <- lval(DataMLB$Hits))
## depth lo hi mids spreads
## M 25.5 162.5 162.5 162.50 0.0
## H 13.0 160.0 168.0 164.00 8.0
## E 7.0 158.0 170.0 164.00 12.0
## D 4.0 155.0 175.0 165.00 20.0
## C 2.5 155.0 177.5 166.25 22.5
## B 1.0 150.0 185.0 167.50 35.0

# The mids alone: a steady drift in the mids signals skewness.
select(letter.values, mids)
## mids
## M 162.50
## H 164.00
## E 164.00
## D 165.00
## C 166.25
## B 167.50

# Plot the mids against the letter-value index (1 = M, 2 = H, ...).
# row_number() numbers however many letter values lval() returned, instead of
# assuming there are exactly six.
letter.values %>%
  mutate(LV = row_number()) %>%
  ggplot(aes(LV, mids)) +
  geom_point() +
  ggtitle("Raw Data")
The exploratory analysis of MLB player hit totals shows that the Hits variable is unimodal, with most players clustered in the middle range and fewer players achieving very high totals. The histogram and density curve indicate mild right skewness, meaning that a small number of elite players stretch the upper tail. The stem-and-leaf plot shows no major gaps or extreme outliers, and the letter-value mids rise steadily from 162.5 at the median to 167.5 at the extremes, which confirms the mild right skew. Transformation diagnostics show that applying a square-root transformation reduces the skewness and makes the distribution more symmetric, reinforcing the underlying pattern. Overall, the data follow a stable, predictable structure, making Hits a reliable variable for smoothing, trend analysis, and further exploratory work.
# Scatterplot of the raw data with Hits on the x-axis and Extra_Base_Hits on
# the y-axis. This is the same orientation as every smoothing and rough plot
# later in the script, so the raw picture can be compared with them directly.
ggplot(DataMLB, aes(Hits, Extra_Base_Hits)) +
  geom_point()

# The first ten players, for reference when checking the smooths by hand.
slice(DataMLB, 1:10)
## # A tibble: 10 × 8
## Player_Name Team BA Hits Walks Strikeouts Singles Extra_Base_Hits
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mike Trout LAA 0.303 185 75 130 130 55
## 2 Mookie Betts LAD 0.298 170 60 120 120 50
## 3 Juan Soto SD 0.312 180 85 110 125 55
## 4 Freddie Freeman LAD 0.301 175 70 105 120 55
## 5 José Ramírez CLE 0.287 160 55 115 115 45
## 6 Trea Turner PHI 0.295 172 65 135 130 42
## 7 Bo Bichette TOR 0.282 165 50 140 120 45
## 8 Salvador Pérez KC 0.271 150 45 110 110 40
## 9 Pete Alonso NYM 0.264 155 40 145 95 60
## 10 Nolan Arenado STL 0.276 160 50 125 110 50
# Running medians of 3 for Extra_Base_Hits over the first 12 players.
# Each interior value is the median of the observation and its two neighbours;
# the two end positions have only one neighbour, so they are left as NA.
# (Previously this column held the raw values shifted down by one row, which
# is not a smooth at all.)
first12_ebh <- DataMLB$Extra_Base_Hits[1:12]
Smooth3 <- as.vector(runmed(first12_ebh, k = 3, endrule = "keep"))
Smooth3[c(1, length(Smooth3))] <- NA

# Show the raw rows side by side with their median-of-3 smooth.
cbind(DataMLB[1:12, ], Smooth3)
## Player_Name Team BA Hits Walks Strikeouts Singles Extra_Base_Hits
## 1 Mike Trout LAA 0.303 185 75 130 130 55
## 2 Mookie Betts LAD 0.298 170 60 120 120 50
## 3 Juan Soto SD 0.312 180 85 110 125 55
## 4 Freddie Freeman LAD 0.301 175 70 105 120 55
## 5 José Ramírez CLE 0.287 160 55 115 115 45
## 6 Trea Turner PHI 0.295 172 65 135 130 42
## 7 Bo Bichette TOR 0.282 165 50 140 120 45
## 8 Salvador Pérez KC 0.271 150 45 110 110 40
## 9 Pete Alonso NYM 0.264 155 40 145 95 60
## 10 Nolan Arenado STL 0.276 160 50 125 110 50
## 11 José Abreu CHW 0.283 162 55 120 115 47
## 12 Xander Bogaerts BOS 0.290 168 60 130 120 48
## Smooth3
## 1 NA
## 2 55
## 3 55
## 4 55
## 5 45
## 6 45
## 7 42
## 8 45
## 9 50
## 10 50
## 11 48
## 12 NA
# Resistant smooths of Extra_Base_Hits, each stored as a new column and then
# drawn over the scatterplot of Extra_Base_Hits against Hits.
#
# NOTE(review): smooth() runs along the rows in their stored order, and the
# rows printed earlier are not sorted by Hits (185, 170, 180, ...). The lines
# below are drawn against Hits, so each smoothed value is a median over
# neighbouring *rows*, not over players with neighbouring hit totals. Confirm
# whether the data should be arranged by Hits before smoothing; doing so
# changes every smooth and rough value reported further down.

# 3R: running medians of 3, repeated until the sequence stops changing.
DataMLB <- DataMLB %>%
  mutate(smooth.3R = as.vector(smooth(Extra_Base_Hits, kind = "3R")))

ggplot(DataMLB, aes(Hits, Extra_Base_Hits)) +
  geom_point() +
  geom_line(aes(y = smooth.3R), color = "red")

# 3RSS: 3R followed by two rounds of splitting; 3RSS in blue, 3R in red.
DataMLB <- DataMLB %>%
  mutate(smooth.3RSS = as.vector(smooth(Extra_Base_Hits, kind = "3RSS")))

ggplot(DataMLB, aes(Hits, Extra_Base_Hits)) +
  geom_point() +
  geom_line(aes(y = smooth.3RSS), color = "blue") +
  geom_line(aes(y = smooth.3R), color = "red")

# 3RSSH: the 3RSS smooth passed through han() (from LearnEDAfunctions;
# presumably hanning, as the write-up describes). The 3RSS column computed
# above is reused rather than smoothing the data a second time.
DataMLB <- DataMLB %>%
  mutate(smooth.3RSSH = han(smooth.3RSS))

ggplot(DataMLB, aes(Hits, Extra_Base_Hits)) +
  geom_point() +
  geom_line(aes(y = smooth.3RSSH), color = "green")
# Rough = data - smooth: how far each player's extra-base hits sit above (+)
# or below (-) the 3RSS smooth.
DataMLB <- mutate(DataMLB, Rough = Extra_Base_Hits - smooth.3RSS)

# First ten rows, now carrying the smooth and rough columns.
slice_head(DataMLB, n = 10)
## # A tibble: 10 × 12
## Player_Name Team BA Hits Walks Strikeouts Singles Extra_Base_Hits
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mike Trout LAA 0.303 185 75 130 130 55
## 2 Mookie Betts LAD 0.298 170 60 120 120 50
## 3 Juan Soto SD 0.312 180 85 110 125 55
## 4 Freddie Freeman LAD 0.301 175 70 105 120 55
## 5 José Ramírez CLE 0.287 160 55 115 115 45
## 6 Trea Turner PHI 0.295 172 65 135 130 42
## 7 Bo Bichette TOR 0.282 165 50 140 120 45
## 8 Salvador Pérez KC 0.271 150 45 110 110 40
## 9 Pete Alonso NYM 0.264 155 40 145 95 60
## 10 Nolan Arenado STL 0.276 160 50 125 110 50
## # ℹ 4 more variables: smooth.3R <dbl>, smooth.3RSS <dbl>, smooth.3RSSH <dbl>,
## # Rough <dbl>

# Narrow the console width so the full vector of rough values wraps compactly.
options(width = 60)

# Every rough value, in row order.
pull(DataMLB, Rough)
## [1] 0 -5 0 0 0 -3 0 -5 15 2 -1 0 12 -3 0 0 0 0
## [19] 0 0 0 4 6 -7 -1 0 2 0 0 5 0 -1 0 0 0 -2
## [37] 2 0 -3 0 3 -1 0 0 0 0 4 6 -2 0
# Final smooth and its rough, added in one step:
#   smooth.3RS3R.twice - the 3RS3R smooth of Extra_Base_Hits
#   FinalRough         - data minus that smooth
#   size               - absolute size of the final rough
# NOTE(review): despite the ".twice" suffix, smooth() is called here without
# twiceit = TRUE, so no twicing is applied (twiceit is FALSE unless set).
# Confirm whether a twiced smooth was intended; if so the rough values and the
# stem display below need to be regenerated.
DataMLB <- DataMLB %>%
  mutate(
    smooth.3RS3R.twice = as.vector(smooth(Extra_Base_Hits, kind = "3RS3R")),
    FinalRough = Extra_Base_Hits - smooth.3RS3R.twice,
    size = abs(FinalRough)
  )

# The 3RS3R smooth (black line) over the raw points.
ggplot(DataMLB, aes(x = Hits, y = Extra_Base_Hits)) +
  geom_point() +
  geom_line(aes(y = smooth.3RS3R.twice), color = "black")

# The final rough against Hits, with a reference line at zero.
ggplot(DataMLB, aes(x = Hits, y = FinalRough)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue")

# Distribution of the rough sizes; large values mark the unusual players.
stem(DataMLB$size, scale = 2)
##
## The decimal point is at the |
##
## 0 | 00000000000000000000000000
## 1 | 0000
## 2 | 000000
## 3 | 00000
## 4 | 00
## 5 | 0000
## 6 | 0
## 7 |
## 8 |
## 9 |
## 10 |
## 11 |
## 12 | 0
## 13 |
## 14 |
## 15 | 0
I explored the relationship between Hits and Extra-Base Hits for 50 MLB players. First, I used a scatterplot to visualize the raw pattern. Then, I applied several resistant smoothing techniques (3R, 3RSS, 3RSS followed by hanning, and 3RS3R) to uncover the underlying trend in the data without assuming a specific model.
After identifying the general trend, I calculated “rough” values — the difference between the actual player performance and the smoothed trend. These residuals show which players hit more or fewer extra-base hits than expected based on their total hits. Finally, I analyzed the distribution of these rough values to understand player variability and identify potential outliers.
This exploratory approach allowed me to interpret hitting performance patterns, discover player types (power vs contact), and better understand the structure in MLB batting data.
To explore patterns in MLB player performance, I focused on the relationship between Hits and Extra-Base Hits, two variables that represent both consistency and power at the plate. I began by creating a scatterplot to visualize the raw relationship, which showed a clear positive association: players who recorded more hits also tended to produce more extra-base hits. Because raw scatterplots can be noisy, I applied a series of robust smoothing techniques (3R, 3RSS, 3RSSH, and 3RS3R) to uncover the underlying trend without being overly influenced by extreme values. The 3R smoother provided an initial, resistant estimate of the trend, while the 3RSS and 3RSSH smoothers created a more refined and stable curve by reducing local fluctuations. This allowed me to better see the general shape of the relationship rather than the random variation between individual players. I also calculated Rough (Extra-Base Hits minus the 3RSS smooth) and FinalRough (Extra-Base Hits minus the 3RS3R smooth), which measure how far each observation deviates from the smoothed trend. These residual-like values helped identify players who performed differently than expected—either overperforming in extra-base hits relative to their total hits or underperforming.
Overall, the smoothing techniques helped reveal the true structure of the relationship between hitting volume and hitting power. The analysis showed that while extra-base hits generally increase with total hits, the rate of increase is not constant, which becomes clearer once noise is reduced. By applying resistant smoothers, I was able to focus on the meaningful pattern in the data and better understand outliers and deviations. This interpretation is essential in exploratory data analysis because the goal is not just to apply methods, but to explain what the data are suggesting and how each statistical tool helps uncover that story.