library(LearnEDAfunctions)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Part 1: Transforming (from Assignment 4) ----

library(readxl)
# Read the MLB batting workbook into a tibble.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file exists.
DataMLB <- read_excel("C:/Users/KayVog22/Downloads/Final Project Data (1).xlsx")
# View() opens a GUI data viewer, so only call it in an interactive session;
# batch runs (Rscript, knitting) have no viewer to open.
if (interactive()) {
  View(DataMLB)
}
# Print the first six rows as a quick sanity check of the import.
head(DataMLB)
## # A tibble: 6 × 8
##   Player_Name     Team     BA  Hits Walks Strikeouts Singles Extra_Base_Hits
##   <chr>           <chr> <dbl> <dbl> <dbl>      <dbl>   <dbl>           <dbl>
## 1 Mike Trout      LAA   0.303   185    75        130     130              55
## 2 Mookie Betts    LAD   0.298   170    60        120     120              50
## 3 Juan Soto       SD    0.312   180    85        110     125              55
## 4 Freddie Freeman LAD   0.301   175    70        105     120              55
## 5 José Ramírez    CLE   0.287   160    55        115     115              45
## 6 Trea Turner     PHI   0.295   172    65        135     130              42
# Stem-and-leaf display of Hits (m = 1 keeps one line per stem).
aplpack::stem.leaf(DataMLB$Hits, m = 1)
## 1 | 2: represents 12
##  leaf unit: 1
##             n: 50
##     7    15 | 0555888
##   (34)   16 | 0000000000002222223355555588888888
##     9    17 | 0000255
##     2    18 | 05
# Draw the histogram on the density scale (freq = FALSE) so that the kernel
# density curve added by lines() shares the same y-axis. On the default count
# scale the curve would be squashed flat against the x-axis.
hist(DataMLB$Hits, freq = FALSE)
lines(density(DataMLB$Hits, bw = 1.5), lwd = 2)

# Kernel density estimate of Hits (bandwidth 1.5) drawn as a bare curve:
# axes, axis labels and title are all suppressed, then a frame is added.
density(DataMLB$Hits, bw = 1.5) %>%
  plot(main = "", xlab = "", ylab = "", axes = FALSE, lwd = 2)
box()

# Use a single-panel layout, then compare the shape of Hits under three
# re-expressions: raw, square root, and a power of 0.001.
par(mfrow = c(1, 1))
hist(x = DataMLB$Hits, main = "Raw+")

hist(x = sqrt(DataMLB$Hits + 0.05), main = "ROOTS")

hist(x = (DataMLB$Hits + 0.05)^0.001, main = "p=0.001")

# Letter-value summary of Hits via lval(); store it, then print it.
letter.values <- lval(DataMLB$Hits)
letter.values
##   depth    lo    hi   mids spreads
## M  25.5 162.5 162.5 162.50     0.0
## H  13.0 160.0 168.0 164.00     8.0
## E   7.0 158.0 170.0 164.00    12.0
## D   4.0 155.0 175.0 165.00    20.0
## C   2.5 155.0 177.5 166.25    22.5
## B   1.0 150.0 185.0 167.50    35.0
# Show only the midsummaries; in the output below they rise from M to B.
letter.values %>% select(mids)
##     mids
## M 162.50
## H 164.00
## E 164.00
## D 165.00
## C 166.25
## B 167.50
# Plot the midsummaries against letter-value index (1 = M, 2 = H, ...).
# The index is derived from the number of rows rather than hard-coded as 1:6,
# so the plot still works when lval() returns a different number of letter
# values (e.g. for a different sample size).
letter.values %>%
  mutate(LV = seq_len(n())) %>%
  ggplot(aes(LV, mids)) +
  geom_point() +
  ggtitle("Raw Data")

# Part 2: Smoothing (from Assignment 6) ----

# Scatterplot with Hits on the x-axis and Strikeouts on the y-axis.
ggplot(data = DataMLB, mapping = aes(x = Hits, y = Strikeouts)) +
  geom_point()

# Display the first ten players (rows 1 through 10 of the tibble).
DataMLB %>% slice(1:10)
## # A tibble: 10 × 8
##    Player_Name     Team     BA  Hits Walks Strikeouts Singles Extra_Base_Hits
##    <chr>           <chr> <dbl> <dbl> <dbl>      <dbl>   <dbl>           <dbl>
##  1 Mike Trout      LAA   0.303   185    75        130     130              55
##  2 Mookie Betts    LAD   0.298   170    60        120     120              50
##  3 Juan Soto       SD    0.312   180    85        110     125              55
##  4 Freddie Freeman LAD   0.301   175    70        105     120              55
##  5 José Ramírez    CLE   0.287   160    55        115     115              45
##  6 Trea Turner     PHI   0.295   172    65        135     130              42
##  7 Bo Bichette     TOR   0.282   165    50        140     120              45
##  8 Salvador Pérez  KC    0.271   150    45        110     110              40
##  9 Pete Alonso     NYM   0.264   155    40        145      95              60
## 10 Nolan Arenado   STL   0.276   160    50        125     110              50
# Running median of 3 over the first 12 Hits values.
# The previous version pasted Hits[1:10] between two NAs, which merely shifted
# the raw data down one row instead of smoothing it. runmed(k = 3) takes the
# median of each value and its two neighbours; as.vector() drops the
# attributes runmed() attaches. The two end positions have no full window of
# three, so they are set to NA.
Smooth3 <- as.vector(runmed(DataMLB$Hits[1:12], k = 3, endrule = "keep"))
Smooth3[c(1, length(Smooth3))] <- NA
# Show the raw rows next to their smoothed Hits value.
cbind(DataMLB[1:12, ], Smooth3)
##        Player_Name Team    BA Hits Walks Strikeouts Singles Extra_Base_Hits
## 1       Mike Trout  LAA 0.303  185    75        130     130              55
## 2     Mookie Betts  LAD 0.298  170    60        120     120              50
## 3        Juan Soto   SD 0.312  180    85        110     125              55
## 4  Freddie Freeman  LAD 0.301  175    70        105     120              55
## 5     José Ramírez  CLE 0.287  160    55        115     115              45
## 6      Trea Turner  PHI 0.295  172    65        135     130              42
## 7      Bo Bichette  TOR 0.282  165    50        140     120              45
## 8   Salvador Pérez   KC 0.271  150    45        110     110              40
## 9      Pete Alonso  NYM 0.264  155    40        145      95              60
## 10   Nolan Arenado  STL 0.276  160    50        125     110              50
## 11      José Abreu  CHW 0.283  162    55        120     115              47
## 12 Xander Bogaerts  BOS 0.290  168    60        130     120              48
##    Smooth3
## 1       NA
## 2      180
## 3      175
## 4      175
## 5      172
## 6      165
## 7      165
## 8      155
## 9      155
## 10     160
## 11     162
## 12      NA
# Add the "3R" smooth of Hits as a plain numeric column, then overlay it
# (red line) on the Hits-versus-Strikeouts scatter.
# NOTE(review): smooth() runs over Hits in the tibble's row order, while the
# plot's x-axis is Strikeouts -- confirm that row order is the intended
# sequence for smoothing, or sort by Strikeouts first.
DataMLB <- mutate(DataMLB, smooth.3R = as.vector(smooth(Hits, kind = "3R")))
ggplot(data = DataMLB, mapping = aes(x = Strikeouts, y = Hits)) +
  geom_point() +
  geom_line(mapping = aes(y = smooth.3R), color = "red")

# Add the "3RSS" smooth of Hits and draw it (blue) together with the earlier
# 3R smooth (red) over the scatter, so the two smoothers can be compared.
DataMLB <- mutate(DataMLB, smooth.3RSS = as.vector(smooth(Hits, kind = "3RSS")))
ggplot(data = DataMLB, mapping = aes(x = Strikeouts, y = Hits)) +
  geom_point() +
  geom_line(mapping = aes(y = smooth.3RSS), color = "blue") +
  geom_line(mapping = aes(y = smooth.3R), color = "red")

# Apply han() to the 3RSS smooth of Hits, store the result as smooth.3RSSH,
# and plot it as a green line over the scatter.
DataMLB <- mutate(
  DataMLB,
  smooth.3RSSH = han(as.vector(smooth(Hits, kind = "3RSS")))
)
ggplot(data = DataMLB, mapping = aes(x = Strikeouts, y = Hits)) +
  geom_point() +
  geom_line(mapping = aes(y = smooth.3RSSH), color = "green")

# Rough = data minus smooth, here using the 3RSS smooth of Hits.
DataMLB <- mutate(DataMLB, Rough = Hits - smooth.3RSS)
# Re-display the first ten rows; the added columns are listed in the footer.
DataMLB %>% slice(1:10)
## # A tibble: 10 × 12
##    Player_Name     Team     BA  Hits Walks Strikeouts Singles Extra_Base_Hits
##    <chr>           <chr> <dbl> <dbl> <dbl>      <dbl>   <dbl>           <dbl>
##  1 Mike Trout      LAA   0.303   185    75        130     130              55
##  2 Mookie Betts    LAD   0.298   170    60        120     120              50
##  3 Juan Soto       SD    0.312   180    85        110     125              55
##  4 Freddie Freeman LAD   0.301   175    70        105     120              55
##  5 José Ramírez    CLE   0.287   160    55        115     115              45
##  6 Trea Turner     PHI   0.295   172    65        135     130              42
##  7 Bo Bichette     TOR   0.282   165    50        140     120              45
##  8 Salvador Pérez  KC    0.271   150    45        110     110              40
##  9 Pete Alonso     NYM   0.264   155    40        145      95              60
## 10 Nolan Arenado   STL   0.276   160    50        125     110              50
## # ℹ 4 more variables: smooth.3R <dbl>, smooth.3RSS <dbl>, smooth.3RSSH <dbl>,
## #   Rough <dbl>
# Narrow the console width so the residual vector wraps compactly when printed.
options(width = 60)
DataMLB[["Rough"]]
##  [1]   0 -10   5   0 -12   7   0 -15  -1   0   0   0   7  -6
## [15]   2   0   3  -3   3  -2   0  10   0   0   0  -2   8   0
## [29]  -2   8   0   0  -7   0   3  -5   5   0  -5   0   6   0
## [43]  -2   3  -2   0  10   0   0   0
# 3RS3R smooth of Hits with twicing.
# The column is named ".twice", but smooth() only performs twicing when
# twiceit = TRUE (its default is FALSE), so the flag is now passed explicitly.
# NOTE(review): recorded output derived from this column (FinalRough, size)
# was produced without twicing and should be regenerated by re-running.
DataMLB <- DataMLB %>%
  mutate(
    smooth.3RS3R.twice = as.vector(smooth(Hits, kind = "3RS3R", twiceit = TRUE))
  )
# Overlay the twiced smooth (black line) on the Hits-versus-Strikeouts scatter.
ggplot(DataMLB, aes(Strikeouts, Hits)) +
  geom_point() +
  geom_line(aes(y = smooth.3RS3R.twice), color = "black")

# Residuals from the smooth.3RS3R.twice column, plotted against Strikeouts
# with a blue horizontal reference line at zero.
DataMLB <- mutate(DataMLB, FinalRough = Hits - smooth.3RS3R.twice)
ggplot(data = DataMLB, mapping = aes(x = Strikeouts, y = FinalRough)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue")

# Absolute size of the final residuals, summarised with a stem-and-leaf plot.
DataMLB <- mutate(DataMLB, size = abs(FinalRough))
stem(DataMLB[["size"]], scale = 2)
## 
##   The decimal point is at the |
## 
##    0 | 0000000000000000000
##    1 | 
##    2 | 000000
##    3 | 00000000
##    4 | 
##    5 | 00000
##    6 | 00
##    7 | 000
##    8 | 00
##    9 | 
##   10 | 000
##   11 | 
##   12 | 0
##   13 | 
##   14 | 
##   15 | 0