library(tidyverse)
library(dplyr)
library(knitr)
Build A nfoldconvolution Function
- Being that the dice example is easier to visualize, I use it to figure out how to program a Nfoldconvolution function
- Our space \(S_2=X_1+X_2\)
# X1
x_1 <- c(rep(1/6, 6))
# x2
x_2 <- c(rep(1/6, 6))
# S2=x1+x2
s_2 <- convolve(x_1, x_2, type = "open")
s_2
## [1] 0.02777778 0.05555556 0.08333333 0.11111111 0.13888889 0.16666667
## [7] 0.13888889 0.11111111 0.08333333 0.05555556 0.02777778
Sanity and Visual Check
- \(S_2(2)= \frac{1}{6}*\frac{1}{6}\)
- \(S_2(12)= \frac{1}{6}*\frac{1}{6}\)
## sanity check sum 2, 12 should equal 1/6*1/6
dice_add_to_2 <- s_2[1]
dice_add_to_12 <- s_2[11]
round(dice_add_to_2, 2) == round(dice_add_to_12, 2)
## [1] TRUE
round(1/36, 2) == round(dice_add_to_2, 2)
## [1] TRUE
## visual check
s_2_dice <- sample(2:12, 50000, s_2, replace = TRUE)
hist(s_2_dice)

Build s3
\(S_3= x_3+s_2\)
# s3
s_3 <- convolve(s_2,rev(x_2), type = "open")
hist(sample(3:18, 50000, s_3, replace = TRUE))

Build n_fold_convolution
- Function only works with discrete continuous integers as data
- Dispalys a simulation of results
- Calculates and returns actual probability distribution
my_n_fold_convolution <- function(s, prob, labels, start, stop)
{
fold <- 2
s_distribution <- prob
while (fold <= s)
{
s_distribution <- convolve(prob, rev(s_distribution), conj = TRUE,
type = "open")
fold <- fold + 1
}
labels_start <- s * start
label_end <- s * stop
my_labels <- labels_start:label_end
hist(sample(my_labels, 2e+05, s_distribution, replace = TRUE), main = "histogram of simulation")
return(s_distribution)
}
Test Function
- Tested on 2 and 3 everything seems fine
dice <- 1:6
my_s2 <- my_n_fold_convolution(2, x_2, dice, 1, 6)

my_s3 <- my_n_fold_convolution(3, x_2, dice, 1, 6)

kable(as_data_frame(cbind(my_s3, s3 = s_3 ,my_s2, s2 = s_2)))
| 0.0046296 |
0.0046296 |
0.0277778 |
0.0277778 |
| 0.0138889 |
0.0138889 |
0.0555556 |
0.0555556 |
| 0.0277778 |
0.0277778 |
0.0833333 |
0.0833333 |
| 0.0462963 |
0.0462963 |
0.1111111 |
0.1111111 |
| 0.0694444 |
0.0694444 |
0.1388889 |
0.1388889 |
| 0.0972222 |
0.0972222 |
0.1666667 |
0.1666667 |
| 0.1157407 |
0.1157407 |
0.1388889 |
0.1388889 |
| 0.1250000 |
0.1250000 |
0.1111111 |
0.1111111 |
| 0.1250000 |
0.1250000 |
0.0833333 |
0.0833333 |
| 0.1157407 |
0.1157407 |
0.0555556 |
0.0555556 |
| 0.0972222 |
0.0972222 |
0.0277778 |
0.0277778 |
| 0.0694444 |
0.0694444 |
0.0277778 |
0.0277778 |
| 0.0462963 |
0.0462963 |
0.0555556 |
0.0555556 |
| 0.0277778 |
0.0277778 |
0.0833333 |
0.0833333 |
| 0.0138889 |
0.0138889 |
0.1111111 |
0.1111111 |
| 0.0046296 |
0.0046296 |
0.1388889 |
0.1388889 |
Problem 7.4
Solve problem
- Amount of winnings is (1,2,3)
- I want s(10)
- probability= (1/4,1/4,1/2)
- start=1
- end=3
winnings_dist <- my_n_fold_convolution(s = 10, prob = c(1/4, 1/4, 1/2),
labels = c(1, 2, 3), start = 1, stop = 3)

## [1] 9.536743e-07 9.536743e-06 6.198883e-05 2.861023e-04 1.058578e-03
## [6] 3.215790e-03 8.325577e-03 1.853943e-02 3.609180e-02 6.157875e-02
## [11] 9.277821e-02 1.231575e-01 1.443672e-01 1.483154e-01 1.332092e-01
## [16] 1.029053e-01 6.774902e-02 3.662109e-02 1.586914e-02 4.882813e-03
## [21] 9.765625e-04
Conclusion
- Originally i didnt want to do this problem, and I wanted to attempt the baseball question, question 8.
- Let’s try question 8
Problem 7.8
our_probability <- c(0.4, 0.2, 0.2, 0.1, 0.1)
my_labels <- 0:4
start <- 0
end <- 4
Histograms of simulations for 4,5,6,7 games
four_games <- my_n_fold_convolution(s = 4, prob = our_probability, labels = my_labels,
start = 0, stop = 4)

five_games <- my_n_fold_convolution(s = 5, prob = our_probability, labels = my_labels,
start = 0, stop = 4)

six_games <- my_n_fold_convolution(s = 6, prob = our_probability, labels = my_labels,
start = 0, stop = 4)

seven_games <- my_n_fold_convolution(s = 7 , prob = our_probability, labels = my_labels,
start = 0, stop = 4)

Hit Distributions For 4 game series
expected_hits_fourgames <- as_data_frame(cbind(four_games, hits = start:(end *
4)))
kable(expected_hits_fourgames)
| 0.0256 |
0 |
| 0.0512 |
1 |
| 0.0896 |
2 |
| 0.1152 |
3 |
| 0.1424 |
4 |
| 0.1408 |
5 |
| 0.1312 |
6 |
| 0.1056 |
7 |
| 0.0808 |
8 |
| 0.0528 |
9 |
| 0.0328 |
10 |
| 0.0176 |
11 |
| 0.0089 |
12 |
| 0.0036 |
13 |
| 0.0014 |
14 |
| 0.0004 |
15 |
| 0.0001 |
16 |
Quick Sanity Check
- Above table shows 0 hits as .0256
- Above table shows 16 hits as .0001
## [1] 0.0256
## [1] 1e-04
Hit Distributions For 5,6,7 Games
# 5 games
expected_hits_fivegames <- as_data_frame(cbind(five_games, hits = start:(end *
5)))
kable(expected_hits_fivegames)
| 0.01024 |
0 |
| 0.02560 |
1 |
| 0.05120 |
2 |
| 0.07680 |
3 |
| 0.10560 |
4 |
| 0.12192 |
5 |
| 0.12960 |
6 |
| 0.12240 |
7 |
| 0.10800 |
8 |
| 0.08560 |
9 |
| 0.06352 |
10 |
| 0.04280 |
11 |
| 0.02700 |
12 |
| 0.01530 |
13 |
| 0.00810 |
14 |
| 0.00381 |
15 |
| 0.00165 |
16 |
| 0.00060 |
17 |
| 0.00020 |
18 |
| 0.00005 |
19 |
| 0.00001 |
20 |
# 6 games
expected_hits_sixgames <- as_data_frame(cbind(six_games, hits = start:(end *
6)))
kable(expected_hits_sixgames)
| 0.004096 |
0 |
| 0.012288 |
1 |
| 0.027648 |
2 |
| 0.047104 |
3 |
| 0.071424 |
4 |
| 0.092928 |
5 |
| 0.110144 |
6 |
| 0.117504 |
7 |
| 0.116352 |
8 |
| 0.105472 |
9 |
| 0.089328 |
10 |
| 0.069984 |
11 |
| 0.051424 |
12 |
| 0.034992 |
13 |
| 0.022332 |
14 |
| 0.013184 |
15 |
| 0.007272 |
16 |
| 0.003672 |
17 |
| 0.001721 |
18 |
| 0.000726 |
19 |
| 0.000279 |
20 |
| 0.000092 |
21 |
| 0.000027 |
22 |
| 0.000006 |
23 |
| 0.000001 |
24 |
# 7 games
expected_hits_sevengames <- as_data_frame(cbind(seven_games, hits = start:(end *
7)))
kable(expected_hits_sevengames)
| 0.0016384 |
0 |
| 0.0057344 |
1 |
| 0.0143360 |
2 |
| 0.0272384 |
3 |
| 0.0451584 |
4 |
| 0.0648704 |
5 |
| 0.0844032 |
6 |
| 0.0994688 |
7 |
| 0.1085056 |
8 |
| 0.1092672 |
9 |
| 0.1028608 |
10 |
| 0.0903392 |
11 |
| 0.0746144 |
12 |
| 0.0577584 |
13 |
| 0.0421472 |
14 |
| 0.0288792 |
15 |
| 0.0186536 |
16 |
| 0.0112924 |
17 |
| 0.0064288 |
18 |
| 0.0034146 |
19 |
| 0.0016954 |
20 |
| 0.0007771 |
21 |
| 0.0003297 |
22 |
| 0.0001267 |
23 |
| 0.0000441 |
24 |
| 0.0000133 |
25 |
| 0.0000035 |
26 |
| 0.0000007 |
27 |
| 0.0000001 |
28 |
B batting average above .400
- To calculate the batting average above .400 we take total probability given a players \(\frac{hits}{ab}>.400\)
- Below we first calculate how many hits a player needs in each series length to bat 400 and above
- Then we find the sum of the probability of getting that hit amount and above
- Answer
- 4 games = 0.304
- 5 games = .357
- 6 games = .295
- 7 games = .246
- The reason for the large variety in outcomes is that our data is discrete and therefore depending on how many at bats occur, the amount of hits required to reach .400 varies.
- In 5 games(highest probability),8 hiss are required to reach .400, 8/20 = .400
- In 7 games(lowest probability), 12 hits are required to reach .400, 12/28 = .428
## # A tibble: 17 x 2
## four_games hits
## <dbl> <dbl>
## 1 0.0256 0
## 2 0.0512 1
## 3 0.0896 2
## 4 0.115 3
## 5 0.142 4
## 6 0.141 5
## 7 0.131 6
## 8 0.106 7
## 9 0.0808 8
## 10 0.0528 9
## 11 0.0328 10
## 12 0.0176 11
## 13 0.00890 12
## 14 0.00360 13
## 15 0.00140 14
## 16 0.000400 15
## 17 0.0001000 16
# calculate hits needed 4 games
four_gamesba <- 7/16
# filter and summarize
expected_hits_fourgames %>% filter(hits >= 7) %>% summarize(sum(four_games))
## # A tibble: 1 x 1
## `sum(four_games)`
## <dbl>
## 1 0.304
# calculate hits needed 5 games
five_gamesba <- 8/20
# filter and summarize
expected_hits_fivegames %>% filter(hits >= 8) %>% summarize(sum(five_games))
## # A tibble: 1 x 1
## `sum(five_games)`
## <dbl>
## 1 0.357
# calculate hits needed 6 games
six_gamesba <- 10/24
# filter and summarize
expected_hits_sixgames %>% filter(hits >= 10) %>% summarize(sum(six_games))
## # A tibble: 1 x 1
## `sum(six_games)`
## <dbl>
## 1 0.295
# calculate hits needed 7 games
seven_gamesba <- 12/28
# filter and summarize
expected_hits_sevengames %>% filter(hits >= 12) %>% summarize(sum(seven_games))
## # A tibble: 1 x 1
## `sum(seven_games)`
## <dbl>
## 1 0.246
Long Term Batting Average
- Long term batting average is just the mean expected value of the distribution
- Can be solved via simulation
- Assuming 165 games in a year and ten year career, we can simulate with n=1650
- This outcome is likely to differ from theoretical distribution considering sample size
- .328
- Strictly using distribution function, function seems to break
- I got rid of graph and solved programatically
- expected batting average .325
Solve Using Simulation
## simulation
set.seed(5)
batting_avg_longterm <- sample(0:4, 365, our_probability, replace = TRUE)
sum(batting_avg_longterm)/(length(batting_avg_longterm) * 4)
## [1] 0.3267123
Solve With Distribution Function
# Solution A
total_career_hit_distribution <- my_n_fold_convolution(s = 1650, prob = our_probability,
my_labels, 0, 4)
my_df <- as_data_frame(cbind(total_career_hit_distribution, (0:6600)))
## expected value of hits at .500 of distribution
sum(my_df$total_career_hit_distribution[0:2145])
## [1] 0.4974554
## 2145 hits/ 6600 total ab
2145/6600
## [1] 0.325
# Alternative solution
my_cdf <- dplyr::as_data_frame(cbind(cumsum(my_df$total_career_hit_distribution),
my_df$V1))
my_cdf %>% filter(V1 > 0.5 & V1 < 0.51)
## # A tibble: 1 x 2
## V1 V2
## <dbl> <dbl>
## 1 0.505 2145