library(tidyverse)
library(dplyr)
library(knitr)

Build A nfoldconvolution Function

Being that the dice example is easier to visualize, I use it to figure out how to program a Nfoldconvolution function
Our space \(S_2=X_1+X_2\)

# X1
x_1 <- c(rep(1/6, 6))
# x2
x_2 <- c(rep(1/6, 6))
# S2=x1+x2
s_2 <- convolve(x_1, x_2, type = "open")
s_2

##  [1] 0.02777778 0.05555556 0.08333333 0.11111111 0.13888889 0.16666667
##  [7] 0.13888889 0.11111111 0.08333333 0.05555556 0.02777778

Sanity and Visual Check

\(S_2(2)= \frac{1}{6}*\frac{1}{6}\)
\(S_2(12)= \frac{1}{6}*\frac{1}{6}\)

## sanity check sum 2, 12 should equal 1/6*1/6
dice_add_to_2 <- s_2[1]
dice_add_to_12 <- s_2[11]
round(dice_add_to_2, 2) == round(dice_add_to_12, 2)

## [1] TRUE

round(1/36, 2) == round(dice_add_to_2, 2)

## [1] TRUE

## visual check
s_2_dice <- sample(2:12, 50000, s_2, replace = TRUE)
hist(s_2_dice)

Build s3

\(S_3= x_3+s_2\)

# s3
s_3 <- convolve(s_2,rev(x_2), type = "open")
hist(sample(3:18, 50000, s_3, replace = TRUE))

Build n_fold_convolution

Function only works with discrete continuous integers as data
- Dispalys a simulation of results
- Calculates and returns actual probability distribution

my_n_fold_convolution <- function(s, prob, labels, start, stop)
{
    fold <- 2
    s_distribution <- prob
    while (fold <= s)
    {
        s_distribution <- convolve(prob, rev(s_distribution), conj = TRUE, 
            type = "open")
        fold <- fold + 1
    }
    labels_start <- s * start
    label_end <- s * stop
    my_labels <- labels_start:label_end
    hist(sample(my_labels, 2e+05, s_distribution, replace = TRUE), main = "histogram of simulation")
    return(s_distribution)
}

Test Function

Tested on 2 and 3 everything seems fine

dice <- 1:6
my_s2 <- my_n_fold_convolution(2, x_2, dice, 1, 6)

my_s3 <- my_n_fold_convolution(3, x_2, dice, 1, 6)

kable(as_data_frame(cbind(my_s3, s3 = s_3 ,my_s2, s2 = s_2)))

my_s3	s3	my_s2	s2
0.0046296	0.0046296	0.0277778	0.0277778
0.0138889	0.0138889	0.0555556	0.0555556
0.0277778	0.0277778	0.0833333	0.0833333
0.0462963	0.0462963	0.1111111	0.1111111
0.0694444	0.0694444	0.1388889	0.1388889
0.0972222	0.0972222	0.1666667	0.1666667
0.1157407	0.1157407	0.1388889	0.1388889
0.1250000	0.1250000	0.1111111	0.1111111
0.1250000	0.1250000	0.0833333	0.0833333
0.1157407	0.1157407	0.0555556	0.0555556
0.0972222	0.0972222	0.0277778	0.0277778
0.0694444	0.0694444	0.0277778	0.0277778
0.0462963	0.0462963	0.0555556	0.0555556
0.0277778	0.0277778	0.0833333	0.0833333
0.0138889	0.0138889	0.1111111	0.1111111
0.0046296	0.0046296	0.1388889	0.1388889

Problem 7.4

Solve problem

Amount of winnings is (1,2,3)
I want s(10)
probability= (1/4,1/4,1/2)
start=1
end=3

winnings_dist <- my_n_fold_convolution(s = 10, prob = c(1/4, 1/4, 1/2), 
    labels = c(1, 2, 3), start = 1, stop = 3)

winnings_dist

##  [1] 9.536743e-07 9.536743e-06 6.198883e-05 2.861023e-04 1.058578e-03
##  [6] 3.215790e-03 8.325577e-03 1.853943e-02 3.609180e-02 6.157875e-02
## [11] 9.277821e-02 1.231575e-01 1.443672e-01 1.483154e-01 1.332092e-01
## [16] 1.029053e-01 6.774902e-02 3.662109e-02 1.586914e-02 4.882813e-03
## [21] 9.765625e-04

Conclusion

Originally i didnt want to do this problem, and I wanted to attempt the baseball question, question 8.
Let’s try question 8

Problem 7.8

our_probability <- c(0.4, 0.2, 0.2, 0.1, 0.1)
my_labels <- 0:4
start <- 0
end <- 4

Histograms of simulations for 4,5,6,7 games

four_games <- my_n_fold_convolution(s = 4, prob = our_probability, labels = my_labels, 
    start = 0, stop =  4)

five_games <- my_n_fold_convolution(s = 5, prob = our_probability, labels = my_labels, 
    start = 0, stop = 4)

six_games <- my_n_fold_convolution(s = 6, prob = our_probability, labels = my_labels, 
    start = 0, stop = 4)

seven_games <- my_n_fold_convolution(s = 7 , prob = our_probability, labels = my_labels, 
    start = 0, stop = 4)

Hit Distributions For 4 game series

expected_hits_fourgames <- as_data_frame(cbind(four_games, hits = start:(end * 
    4)))
kable(expected_hits_fourgames)

four_games	hits
0.0256	0
0.0512	1
0.0896	2
0.1152	3
0.1424	4
0.1408	5
0.1312	6
0.1056	7
0.0808	8
0.0528	9
0.0328	10
0.0176	11
0.0089	12
0.0036	13
0.0014	14
0.0004	15
0.0001	16

Quick Sanity Check

Above table shows 0 hits as .0256
- should be .4 **4
Above table shows 16 hits as .0001
- should be .1*4

print(0.4^4)

## [1] 0.0256

print(0.1^4)

## [1] 1e-04

Hit Distributions For 5,6,7 Games

# 5 games
expected_hits_fivegames <- as_data_frame(cbind(five_games, hits = start:(end * 
    5)))
kable(expected_hits_fivegames)

five_games	hits
0.01024	0
0.02560	1
0.05120	2
0.07680	3
0.10560	4
0.12192	5
0.12960	6
0.12240	7
0.10800	8
0.08560	9
0.06352	10
0.04280	11
0.02700	12
0.01530	13
0.00810	14
0.00381	15
0.00165	16
0.00060	17
0.00020	18
0.00005	19
0.00001	20

# 6 games
expected_hits_sixgames <- as_data_frame(cbind(six_games, hits = start:(end * 
    6)))
kable(expected_hits_sixgames)

six_games	hits
0.004096	0
0.012288	1
0.027648	2
0.047104	3
0.071424	4
0.092928	5
0.110144	6
0.117504	7
0.116352	8
0.105472	9
0.089328	10
0.069984	11
0.051424	12
0.034992	13
0.022332	14
0.013184	15
0.007272	16
0.003672	17
0.001721	18
0.000726	19
0.000279	20
0.000092	21
0.000027	22
0.000006	23
0.000001	24

# 7 games
expected_hits_sevengames <- as_data_frame(cbind(seven_games, hits = start:(end * 
    7)))
kable(expected_hits_sevengames)

seven_games	hits
0.0016384	0
0.0057344	1
0.0143360	2
0.0272384	3
0.0451584	4
0.0648704	5
0.0844032	6
0.0994688	7
0.1085056	8
0.1092672	9
0.1028608	10
0.0903392	11
0.0746144	12
0.0577584	13
0.0421472	14
0.0288792	15
0.0186536	16
0.0112924	17
0.0064288	18
0.0034146	19
0.0016954	20
0.0007771	21
0.0003297	22
0.0001267	23
0.0000441	24
0.0000133	25
0.0000035	26
0.0000007	27
0.0000001	28

B batting average above .400

To calculate the batting average above .400 we take total probability given a players \(\frac{hits}{ab}>.400\)
- Below we first calculate how many hits a player needs in each series length to bat 400 and above
- Then we find the sum of the probability of getting that hit amount and above
Answer
- 4 games = 0.304
- 5 games = .357
- 6 games = .295
- 7 games = .246
- The reason for the large variety in outcomes is that our data is discrete and therefore depending on how many at bats occur, the amount of hits required to reach .400 varies.
  - In 5 games(highest probability),8 hiss are required to reach .400, 8/20 = .400
  - In 7 games(lowest probability), 12 hits are required to reach .400, 12/28 = .428

expected_hits_fourgames

## # A tibble: 17 x 2
##    four_games  hits
##         <dbl> <dbl>
##  1  0.0256        0
##  2  0.0512        1
##  3  0.0896        2
##  4  0.115         3
##  5  0.142         4
##  6  0.141         5
##  7  0.131         6
##  8  0.106         7
##  9  0.0808        8
## 10  0.0528        9
## 11  0.0328       10
## 12  0.0176       11
## 13  0.00890      12
## 14  0.00360      13
## 15  0.00140      14
## 16  0.000400     15
## 17  0.0001000    16

# calculate hits needed 4 games
four_gamesba <- 7/16
# filter and summarize
expected_hits_fourgames %>% filter(hits >= 7) %>% summarize(sum(four_games))

## # A tibble: 1 x 1
##   `sum(four_games)`
##               <dbl>
## 1             0.304

# calculate hits needed 5 games
five_gamesba <- 8/20
# filter and summarize
expected_hits_fivegames %>% filter(hits >= 8) %>% summarize(sum(five_games))

## # A tibble: 1 x 1
##   `sum(five_games)`
##               <dbl>
## 1             0.357

# calculate hits needed 6 games
six_gamesba <- 10/24
# filter and summarize
expected_hits_sixgames %>% filter(hits >= 10) %>% summarize(sum(six_games))

## # A tibble: 1 x 1
##   `sum(six_games)`
##              <dbl>
## 1            0.295

# calculate hits needed 7 games
seven_gamesba <- 12/28
# filter and summarize
expected_hits_sevengames %>% filter(hits >= 12) %>% summarize(sum(seven_games))

## # A tibble: 1 x 1
##   `sum(seven_games)`
##                <dbl>
## 1              0.246

Long Term Batting Average

Long term batting average is just the mean expected value of the distribution
Can be solved via simulation
- Assuming 165 games in a year and ten year career, we can simulate with n=1650
  - This outcome is likely to differ from theoretical distribution considering sample size
  - .328
Strictly using distribution function, function seems to break
- I got rid of graph and solved programatically
- expected batting average .325

Solve Using Simulation

## simulation
set.seed(5)
batting_avg_longterm <- sample(0:4, 365, our_probability, replace = TRUE)
sum(batting_avg_longterm)/(length(batting_avg_longterm) * 4)

## [1] 0.3267123

Solve With Distribution Function

# Solution A
total_career_hit_distribution <- my_n_fold_convolution(s = 1650, prob = our_probability, 
    my_labels, 0, 4)
my_df <- as_data_frame(cbind(total_career_hit_distribution, (0:6600)))
## expected value of hits at .500 of distribution
sum(my_df$total_career_hit_distribution[0:2145])

## [1] 0.4974554

## 2145 hits/ 6600 total ab
2145/6600

## [1] 0.325

# Alternative solution
my_cdf <- dplyr::as_data_frame(cbind(cumsum(my_df$total_career_hit_distribution), 
    my_df$V1))
my_cdf %>% filter(V1 > 0.5 & V1 < 0.51)

## # A tibble: 1 x 2
##      V1    V2
##   <dbl> <dbl>
## 1 0.505  2145

Untitled

Justin Herman

October 17, 2018

Build A nfoldconvolution Function

Sanity and Visual Check

Build s3

Build n_fold_convolution

Test Function

Problem 7.4

Solve problem

Conclusion

Problem 7.8

Histograms of simulations for 4,5,6,7 games

Hit Distributions For 4 game series

Quick Sanity Check

Hit Distributions For 5,6,7 Games

B batting average above .400

Long Term Batting Average

Solve Using Simulation

Solve With Distribution Function