library(tidyverse)
library(dplyr)
library(knitr)

Build A nfoldconvolution Function

  • Being that the dice example is easier to visualize, I use it to figure out how to program a Nfoldconvolution function
  • Our space \(S_2=X_1+X_2\)
# X1
x_1 <- c(rep(1/6, 6))
# x2
x_2 <- c(rep(1/6, 6))
# S2=x1+x2
s_2 <- convolve(x_1, x_2, type = "open")
s_2
##  [1] 0.02777778 0.05555556 0.08333333 0.11111111 0.13888889 0.16666667
##  [7] 0.13888889 0.11111111 0.08333333 0.05555556 0.02777778

Sanity and Visual Check

  • \(S_2(2)= \frac{1}{6}*\frac{1}{6}\)
  • \(S_2(12)= \frac{1}{6}*\frac{1}{6}\)
## sanity check sum 2, 12 should equal 1/6*1/6
dice_add_to_2 <- s_2[1]
dice_add_to_12 <- s_2[11]
round(dice_add_to_2, 2) == round(dice_add_to_12, 2)
## [1] TRUE
round(1/36, 2) == round(dice_add_to_2, 2)
## [1] TRUE
## visual check
s_2_dice <- sample(2:12, 50000, s_2, replace = TRUE)
hist(s_2_dice)

Build s3

\(S_3= x_3+s_2\)

# s3
s_3 <- convolve(s_2,rev(x_2), type = "open")
hist(sample(3:18, 50000, s_3, replace = TRUE))

Build n_fold_convolution

  • Function only works with discrete continuous integers as data
    • Dispalys a simulation of results
    • Calculates and returns actual probability distribution
my_n_fold_convolution <- function(s, prob, labels, start, stop)
{
    fold <- 2
    s_distribution <- prob
    while (fold <= s)
    {
        s_distribution <- convolve(prob, rev(s_distribution), conj = TRUE, 
            type = "open")
        fold <- fold + 1
    }
    labels_start <- s * start
    label_end <- s * stop
    my_labels <- labels_start:label_end
    hist(sample(my_labels, 2e+05, s_distribution, replace = TRUE), main = "histogram of simulation")
    return(s_distribution)
}

Test Function

  • Tested on 2 and 3 everything seems fine
dice <- 1:6
my_s2 <- my_n_fold_convolution(2, x_2, dice, 1, 6)

my_s3 <- my_n_fold_convolution(3, x_2, dice, 1, 6)

kable(as_data_frame(cbind(my_s3, s3 = s_3 ,my_s2, s2 = s_2)))
my_s3 s3 my_s2 s2
0.0046296 0.0046296 0.0277778 0.0277778
0.0138889 0.0138889 0.0555556 0.0555556
0.0277778 0.0277778 0.0833333 0.0833333
0.0462963 0.0462963 0.1111111 0.1111111
0.0694444 0.0694444 0.1388889 0.1388889
0.0972222 0.0972222 0.1666667 0.1666667
0.1157407 0.1157407 0.1388889 0.1388889
0.1250000 0.1250000 0.1111111 0.1111111
0.1250000 0.1250000 0.0833333 0.0833333
0.1157407 0.1157407 0.0555556 0.0555556
0.0972222 0.0972222 0.0277778 0.0277778
0.0694444 0.0694444 0.0277778 0.0277778
0.0462963 0.0462963 0.0555556 0.0555556
0.0277778 0.0277778 0.0833333 0.0833333
0.0138889 0.0138889 0.1111111 0.1111111
0.0046296 0.0046296 0.1388889 0.1388889

Problem 7.4

Solve problem

  • Amount of winnings is (1,2,3)
  • I want s(10)
  • probability= (1/4,1/4,1/2)
  • start=1
  • end=3
winnings_dist <- my_n_fold_convolution(s = 10, prob = c(1/4, 1/4, 1/2), 
    labels = c(1, 2, 3), start = 1, stop = 3)

winnings_dist
##  [1] 9.536743e-07 9.536743e-06 6.198883e-05 2.861023e-04 1.058578e-03
##  [6] 3.215790e-03 8.325577e-03 1.853943e-02 3.609180e-02 6.157875e-02
## [11] 9.277821e-02 1.231575e-01 1.443672e-01 1.483154e-01 1.332092e-01
## [16] 1.029053e-01 6.774902e-02 3.662109e-02 1.586914e-02 4.882813e-03
## [21] 9.765625e-04

Conclusion

  • Originally i didnt want to do this problem, and I wanted to attempt the baseball question, question 8.
  • Let’s try question 8

Problem 7.8

our_probability <- c(0.4, 0.2, 0.2, 0.1, 0.1)
my_labels <- 0:4
start <- 0
end <- 4

Histograms of simulations for 4,5,6,7 games

four_games <- my_n_fold_convolution(s = 4, prob = our_probability, labels = my_labels, 
    start = 0, stop =  4)

five_games <- my_n_fold_convolution(s = 5, prob = our_probability, labels = my_labels, 
    start = 0, stop = 4)

six_games <- my_n_fold_convolution(s = 6, prob = our_probability, labels = my_labels, 
    start = 0, stop = 4)

seven_games <- my_n_fold_convolution(s = 7 , prob = our_probability, labels = my_labels, 
    start = 0, stop = 4)

Hit Distributions For 4 game series

expected_hits_fourgames <- as_data_frame(cbind(four_games, hits = start:(end * 
    4)))
kable(expected_hits_fourgames)
four_games hits
0.0256 0
0.0512 1
0.0896 2
0.1152 3
0.1424 4
0.1408 5
0.1312 6
0.1056 7
0.0808 8
0.0528 9
0.0328 10
0.0176 11
0.0089 12
0.0036 13
0.0014 14
0.0004 15
0.0001 16

Quick Sanity Check

  • Above table shows 0 hits as .0256
    • should be .4 **4
  • Above table shows 16 hits as .0001
    • should be .1*4
print(0.4^4)
## [1] 0.0256
print(0.1^4)
## [1] 1e-04

Hit Distributions For 5,6,7 Games

# 5 games
expected_hits_fivegames <- as_data_frame(cbind(five_games, hits = start:(end * 
    5)))
kable(expected_hits_fivegames)
five_games hits
0.01024 0
0.02560 1
0.05120 2
0.07680 3
0.10560 4
0.12192 5
0.12960 6
0.12240 7
0.10800 8
0.08560 9
0.06352 10
0.04280 11
0.02700 12
0.01530 13
0.00810 14
0.00381 15
0.00165 16
0.00060 17
0.00020 18
0.00005 19
0.00001 20
# 6 games
expected_hits_sixgames <- as_data_frame(cbind(six_games, hits = start:(end * 
    6)))
kable(expected_hits_sixgames)
six_games hits
0.004096 0
0.012288 1
0.027648 2
0.047104 3
0.071424 4
0.092928 5
0.110144 6
0.117504 7
0.116352 8
0.105472 9
0.089328 10
0.069984 11
0.051424 12
0.034992 13
0.022332 14
0.013184 15
0.007272 16
0.003672 17
0.001721 18
0.000726 19
0.000279 20
0.000092 21
0.000027 22
0.000006 23
0.000001 24
# 7 games
expected_hits_sevengames <- as_data_frame(cbind(seven_games, hits = start:(end * 
    7)))
kable(expected_hits_sevengames)
seven_games hits
0.0016384 0
0.0057344 1
0.0143360 2
0.0272384 3
0.0451584 4
0.0648704 5
0.0844032 6
0.0994688 7
0.1085056 8
0.1092672 9
0.1028608 10
0.0903392 11
0.0746144 12
0.0577584 13
0.0421472 14
0.0288792 15
0.0186536 16
0.0112924 17
0.0064288 18
0.0034146 19
0.0016954 20
0.0007771 21
0.0003297 22
0.0001267 23
0.0000441 24
0.0000133 25
0.0000035 26
0.0000007 27
0.0000001 28

B batting average above .400

  • To calculate the batting average above .400 we take total probability given a players \(\frac{hits}{ab}>.400\)
    • Below we first calculate how many hits a player needs in each series length to bat 400 and above
    • Then we find the sum of the probability of getting that hit amount and above
  • Answer
    • 4 games = 0.304
    • 5 games = .357
    • 6 games = .295
    • 7 games = .246
    • The reason for the large variety in outcomes is that our data is discrete and therefore depending on how many at bats occur, the amount of hits required to reach .400 varies.
      • In 5 games(highest probability),8 hiss are required to reach .400, 8/20 = .400
      • In 7 games(lowest probability), 12 hits are required to reach .400, 12/28 = .428
expected_hits_fourgames
## # A tibble: 17 x 2
##    four_games  hits
##         <dbl> <dbl>
##  1  0.0256        0
##  2  0.0512        1
##  3  0.0896        2
##  4  0.115         3
##  5  0.142         4
##  6  0.141         5
##  7  0.131         6
##  8  0.106         7
##  9  0.0808        8
## 10  0.0528        9
## 11  0.0328       10
## 12  0.0176       11
## 13  0.00890      12
## 14  0.00360      13
## 15  0.00140      14
## 16  0.000400     15
## 17  0.0001000    16
# calculate hits needed 4 games
four_gamesba <- 7/16
# filter and summarize
expected_hits_fourgames %>% filter(hits >= 7) %>% summarize(sum(four_games))
## # A tibble: 1 x 1
##   `sum(four_games)`
##               <dbl>
## 1             0.304
# calculate hits needed 5 games
five_gamesba <- 8/20
# filter and summarize
expected_hits_fivegames %>% filter(hits >= 8) %>% summarize(sum(five_games))
## # A tibble: 1 x 1
##   `sum(five_games)`
##               <dbl>
## 1             0.357
# calculate hits needed 6 games
six_gamesba <- 10/24
# filter and summarize
expected_hits_sixgames %>% filter(hits >= 10) %>% summarize(sum(six_games))
## # A tibble: 1 x 1
##   `sum(six_games)`
##              <dbl>
## 1            0.295
# calculate hits needed 7 games
seven_gamesba <- 12/28
# filter and summarize
expected_hits_sevengames %>% filter(hits >= 12) %>% summarize(sum(seven_games))
## # A tibble: 1 x 1
##   `sum(seven_games)`
##                <dbl>
## 1              0.246

Long Term Batting Average

  • Long term batting average is just the mean expected value of the distribution
  • Can be solved via simulation
    • Assuming 165 games in a year and ten year career, we can simulate with n=1650
      • This outcome is likely to differ from theoretical distribution considering sample size
      • .328
  • Strictly using distribution function, function seems to break
    • I got rid of graph and solved programatically
    • expected batting average .325

Solve Using Simulation

## simulation
set.seed(5)
batting_avg_longterm <- sample(0:4, 365, our_probability, replace = TRUE)
sum(batting_avg_longterm)/(length(batting_avg_longterm) * 4)
## [1] 0.3267123

Solve With Distribution Function

# Solution A
total_career_hit_distribution <- my_n_fold_convolution(s = 1650, prob = our_probability, 
    my_labels, 0, 4)
my_df <- as_data_frame(cbind(total_career_hit_distribution, (0:6600)))
## expected value of hits at .500 of distribution
sum(my_df$total_career_hit_distribution[0:2145])
## [1] 0.4974554
## 2145 hits/ 6600 total ab
2145/6600
## [1] 0.325
# Alternative solution
my_cdf <- dplyr::as_data_frame(cbind(cumsum(my_df$total_career_hit_distribution), 
    my_df$V1))
my_cdf %>% filter(V1 > 0.5 & V1 < 0.51)
## # A tibble: 1 x 2
##      V1    V2
##   <dbl> <dbl>
## 1 0.505  2145