library(LearnEDAfunctions)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Part 1: Transforming (from Assignment 4)
library(readxl)
# Load the project workbook into DataMLB.
# NOTE(review): the path below is specific to one machine; the script only
# runs where this exact file exists.
DataMLB <- read_excel("C:/Users/KayVog22/Downloads/Final Project Data (1).xlsx")
# View() opens a data viewer, so only call it when a user is at the console;
# this keeps non-interactive runs (Rscript, knitting) from failing here.
if (interactive()) {
  View(DataMLB)
}
# Print the first rows as a quick check of the import.
head(DataMLB)
## # A tibble: 6 × 8
## Player_Name Team BA Hits Walks Strikeouts Singles Extra_Base_Hits
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mike Trout LAA 0.303 185 75 130 130 55
## 2 Mookie Betts LAD 0.298 170 60 120 120 50
## 3 Juan Soto SD 0.312 180 85 110 125 55
## 4 Freddie Freeman LAD 0.301 175 70 105 120 55
## 5 José Ramírez CLE 0.287 160 55 115 115 45
## 6 Trea Turner PHI 0.295 172 65 135 130 42
# Stem-and-leaf display of Hits with one line per stem (m = 1).
aplpack::stem.leaf(
  DataMLB$Hits,
  m = 1
)
## 1 | 2: represents 12
## leaf unit: 1
## n: 50
## 7 15 | 0555888
## (34) 16 | 0000000000002222223355555588888888
## 9 17 | 0000255
## 2 18 | 05
# Histogram of Hits with a kernel density estimate drawn on top.
# freq = FALSE puts the bars on the density scale; with the default count
# scale the density curve (values far below 1) would lie flat on the x-axis.
hist(DataMLB$Hits, freq = FALSE)
lines(density(DataMLB$Hits, bw = 1.5), lwd = 2)

# Draw only the shape of the Hits density curve: axes, axis labels and the
# title are all suppressed, then a plain frame is added with box().
plot(
  density(DataMLB$Hits, bw = 1.5),
  main = "",
  xlab = "",
  ylab = "",
  axes = FALSE,
  lwd = 2
)
box()

# One plot per page, then histograms of Hits under three re-expressions:
# the raw values, square roots, and a power of 0.001.
# A start of 0.05 is added before taking roots and powers.
par(mfrow = c(1, 1))
hist(
  DataMLB$Hits,
  main = "Raw+"
)

hist(
  sqrt(DataMLB$Hits + 0.05),
  main = "ROOTS"
)

hist(
  (DataMLB$Hits + 0.05)^0.001,
  main = "p=0.001"
)

# Letter-value summary of Hits (lval() comes from LearnEDAfunctions);
# store it, then print it.
letter.values <- lval(DataMLB$Hits)
letter.values
## depth lo hi mids spreads
## M 25.5 162.5 162.5 162.50 0.0
## H 13.0 160.0 168.0 164.00 8.0
## E 7.0 158.0 170.0 164.00 12.0
## D 4.0 155.0 175.0 165.00 20.0
## C 2.5 155.0 177.5 166.25 22.5
## B 1.0 150.0 185.0 167.50 35.0
# Show only the mids column of the letter-value table.
letter.values %>%
  select(mids)
## mids
## M 162.50
## H 164.00
## E 164.00
## D 165.00
## C 166.25
## B 167.50
# Plot the mids against the letter-value index (1 = first row of the table).
# row_number() numbers whatever rows lval() returned, so the plot no longer
# depends on the table having exactly six rows.
letter.values %>%
  mutate(LV = row_number()) %>%
  ggplot(aes(LV, mids)) +
  geom_point() +
  ggtitle("Raw Data")

Part 2: Smoothing (from Assignment 6)
# Scatterplot of the two variables used in the smoothing section.
# NOTE(review): Hits is on the x-axis here, whereas the later smoothing plots
# put Strikeouts on x and Hits on y; confirm which orientation is intended.
DataMLB %>%
  ggplot(aes(x = Hits, y = Strikeouts)) +
  geom_point()

# Print the first ten rows of the data.
DataMLB %>%
  slice(seq_len(10))
## # A tibble: 10 × 8
## Player_Name Team BA Hits Walks Strikeouts Singles Extra_Base_Hits
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mike Trout LAA 0.303 185 75 130 130 55
## 2 Mookie Betts LAD 0.298 170 60 120 120 50
## 3 Juan Soto SD 0.312 180 85 110 125 55
## 4 Freddie Freeman LAD 0.301 175 70 105 120 55
## 5 José Ramírez CLE 0.287 160 55 115 115 45
## 6 Trea Turner PHI 0.295 172 65 135 130 42
## 7 Bo Bichette TOR 0.282 165 50 140 120 45
## 8 Salvador Pérez KC 0.271 150 45 110 110 40
## 9 Pete Alonso NYM 0.264 155 40 145 95 60
## 10 Nolan Arenado STL 0.276 160 50 125 110 50
# Running medians of three for the first 12 Hits values, shown next to the
# data. Each interior value is the median of itself and its two neighbours;
# the first and last positions have no complete window, so they stay NA.
# (Previously this column held the raw Hits shifted down one row, which is
# not a smooth.)
Smooth3 <- c(
  NA,
  as.vector(runmed(DataMLB$Hits[1:12], k = 3))[2:11],
  NA
)
cbind(DataMLB[1:12, ], Smooth3)
## Player_Name Team BA Hits Walks Strikeouts Singles Extra_Base_Hits
## 1 Mike Trout LAA 0.303 185 75 130 130 55
## 2 Mookie Betts LAD 0.298 170 60 120 120 50
## 3 Juan Soto SD 0.312 180 85 110 125 55
## 4 Freddie Freeman LAD 0.301 175 70 105 120 55
## 5 José Ramírez CLE 0.287 160 55 115 115 45
## 6 Trea Turner PHI 0.295 172 65 135 130 42
## 7 Bo Bichette TOR 0.282 165 50 140 120 45
## 8 Salvador Pérez KC 0.271 150 45 110 110 40
## 9 Pete Alonso NYM 0.264 155 40 145 95 60
## 10 Nolan Arenado STL 0.276 160 50 125 110 50
## 11 José Abreu CHW 0.283 162 55 120 115 47
## 12 Xander Bogaerts BOS 0.290 168 60 130 120 48
## Smooth3
## 1 NA
## 2 180
## 3 175
## 4 175
## 5 172
## 6 165
## 7 165
## 8 155
## 9 155
## 10 160
## 11 162
## 12 NA
# Add the 3R smooth of Hits (running medians of three, repeated until the
# sequence stops changing) and draw it in red over the scatterplot.
# NOTE(review): smooth() runs over Hits in the current row order, while the
# line is drawn against Strikeouts; confirm the rows are meant to be ordered
# by Strikeouts before smoothing.
DataMLB <- mutate(
  DataMLB,
  smooth.3R = as.vector(smooth(Hits, kind = "3R"))
)
DataMLB %>%
  ggplot(aes(x = Strikeouts, y = Hits)) +
  geom_point() +
  geom_line(aes(y = smooth.3R), colour = "red")

# Add the 3RSS smooth of Hits (3R followed by two splitting passes) and
# compare it (blue) with the earlier 3R smooth (red) on the same plot.
DataMLB <- mutate(
  DataMLB,
  smooth.3RSS = as.vector(smooth(Hits, kind = "3RSS"))
)
DataMLB %>%
  ggplot(aes(x = Strikeouts, y = Hits)) +
  geom_point() +
  geom_line(aes(y = smooth.3RSS), colour = "blue") +
  geom_line(aes(y = smooth.3R), colour = "red")

# Add the 3RSSH smooth: the 3RSS smooth of Hits passed through han() from
# LearnEDAfunctions (the hanning step), drawn in green.
DataMLB <- mutate(
  DataMLB,
  smooth.3RSSH = han(as.vector(smooth(Hits, kind = "3RSS")))
)
DataMLB %>%
  ggplot(aes(x = Strikeouts, y = Hits)) +
  geom_point() +
  geom_line(aes(y = smooth.3RSSH), colour = "green")

# Rough = data minus smooth, using the 3RSS smooth; then show the first
# ten rows with the new columns.
DataMLB <- mutate(DataMLB, Rough = Hits - smooth.3RSS)
DataMLB %>%
  slice(seq_len(10))
## # A tibble: 10 × 12
## Player_Name Team BA Hits Walks Strikeouts Singles Extra_Base_Hits
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mike Trout LAA 0.303 185 75 130 130 55
## 2 Mookie Betts LAD 0.298 170 60 120 120 50
## 3 Juan Soto SD 0.312 180 85 110 125 55
## 4 Freddie Freeman LAD 0.301 175 70 105 120 55
## 5 José Ramírez CLE 0.287 160 55 115 115 45
## 6 Trea Turner PHI 0.295 172 65 135 130 42
## 7 Bo Bichette TOR 0.282 165 50 140 120 45
## 8 Salvador Pérez KC 0.271 150 45 110 110 40
## 9 Pete Alonso NYM 0.264 155 40 145 95 60
## 10 Nolan Arenado STL 0.276 160 50 125 110 50
## # ℹ 4 more variables: smooth.3R <dbl>, smooth.3RSS <dbl>, smooth.3RSSH <dbl>,
## # Rough <dbl>
# Narrow the console width so the vector of roughs wraps at 60 characters,
# then print every Rough value.
options(width = 60)
DataMLB[["Rough"]]
## [1] 0 -10 5 0 -12 7 0 -15 -1 0 0 0 7 -6
## [15] 2 0 3 -3 3 -2 0 10 0 0 0 -2 8 0
## [29] -2 8 0 0 -7 0 3 -5 5 0 -5 0 6 0
## [43] -2 3 -2 0 10 0 0 0
# Add the 3RS3R smooth of Hits (3R, a splitting pass, then 3R again) and
# draw it in black over the scatterplot.
# NOTE(review): the column name says "twice", but smooth() is called without
# twiceit = TRUE, so no twicing is applied; confirm which is intended.
DataMLB <- mutate(
  DataMLB,
  smooth.3RS3R.twice = as.vector(smooth(Hits, kind = "3RS3R"))
)
DataMLB %>%
  ggplot(aes(x = Strikeouts, y = Hits)) +
  geom_point() +
  geom_line(aes(y = smooth.3RS3R.twice), colour = "black")

# Final rough = Hits minus the 3RS3R smooth, plotted against Strikeouts with
# a blue reference line at zero.
DataMLB <- mutate(DataMLB, FinalRough = Hits - smooth.3RS3R.twice)
DataMLB %>%
  ggplot(aes(x = Strikeouts, y = FinalRough)) +
  geom_point() +
  geom_hline(yintercept = 0, colour = "blue")

# Sizes of the final roughs (absolute values), summarised in a
# stem-and-leaf display at double the default scale.
DataMLB <- mutate(DataMLB, size = abs(FinalRough))
stem(DataMLB$size, scale = 2)
##
## The decimal point is at the |
##
## 0 | 0000000000000000000
## 1 |
## 2 | 000000
## 3 | 00000000
## 4 |
## 5 | 00000
## 6 | 00
## 7 | 000
## 8 | 00
## 9 |
## 10 | 000
## 11 |
## 12 | 0
## 13 |
## 14 |
## 15 | 0