library(magrittr)
library(kableExtra)
library(kableExtra)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
Rolling 2 dice and picking the number [2-12], let the X a random variable of the sum of both dice. The Pr(X=7) has the maximum probability of the sum of both dice. Followed by Pr(X=6) or Pr(X=8).
dice_sim = function(d, nreps) {
rolls = matrix(sample(6, size = d * nreps, replace = T), ncol = nreps)
totals<-colSums(rolls)
return(totals)
}
# simulate dice rolls.
dice_roll = function(d, nreps) {
rolls = matrix(sample(6, size = d * nreps, replace = T), ncol = nreps)
return(rolls)
}
totals<-dice_sim(2, 1e5)
probs<-as.data.frame(prop.table(table(totals)))
probs$expected<-c(1,2,3,4,5,6,5,4,3,2,1)/36
colnames(probs)[which(colnames(probs)=='Freq')]<-'simulated'
ggplot(probs,aes(x=totals,y=simulated))+geom_bar(stat='identity')
print(probs[order(probs$simulated,decreasing=TRUE),])
## totals simulated expected
## 6 7 0.16656 0.16666667
## 7 8 0.14043 0.13888889
## 5 6 0.13963 0.13888889
## 4 5 0.11125 0.11111111
## 8 9 0.10869 0.11111111
## 9 10 0.08430 0.08333333
## 3 4 0.08254 0.08333333
## 2 3 0.05592 0.05555556
## 10 11 0.05545 0.05555556
## 1 2 0.02809 0.02777778
## 11 12 0.02714 0.02777778
dice_2<-t(dice_roll(2,1e5))
doubleRoll<-apply(dice_2,1,function(x) sum(duplicated(x)))
prop.table(table(doubleRoll)) %>%
kable() %>%
kable_styling()
| doubleRoll | Freq |
|---|---|
| 0 | 0.833 |
| 1 | 0.167 |
dice_2<-NULL
doubleRoll<-NULL
totals<-NULL
probs<-NULL
We developed a method to simulate and construct a list of possible pairs which can be used for computing probabilities of (i,j,k) selections in this list.
# 4 dice rollled 100,000 times
n=10000
roll4<-t(dice_roll(4, n))
#all possible combinations of 3 triplets [2-12].
ijk<-combn(seq(2,12),3)
### compute the probability of observing i,j,k in all possible pairings.
# for the roll simulation we comute a 2 committee with 2 seats (12:34; 13:24; 14;23)
set.seed(952020)
randomPair<-function(roll4,index=1){
# roll4 is a matrix of 4 die simulation
# index the ith simulation of 4 die.
# we select possible pairs to avoid duplicate numbers rolled.
first<-roll4[index,]
# randomly assign indices
idx<-sample(seq(1,4),replace=F)
first<-data.frame(first,idx=idx)
# randomly select first select the 1st toss
first.seed<-sample(first$idx,1,replace=F)
# of 3 choose 1 pair, complete the first pair.
a.team<-combn(first$idx[which(first$idx!=first.seed)],1)
# all possible first pairs indices.
pair1<-rbind(first.seed,a.team)
# of the remaining 2, automatically selected as pair.
pair2<-apply(pair1,2,function(x)first$idx[which(first$idx!=x[1] & first$idx!=x[2])] )
# sum the first pair.
pair1.sum<-colSums(apply(pair1,2,function(x) first$first[ match(x,first$idx)]))
# sum of 2nd pair.
pair2.sum<-colSums(apply(pair2,2,function(x) first$first[ match(x,first$idx)]))
# sum of both pairs of all possible combo
pairSums<-(rbind(pair1.sum,pair2.sum))
colnames(pairSums)<-paste0("combn_",seq(1,3))
return(pairSums)
}
## create array of all possible sums of 2 dice.
pairList<-lapply(1:nrow(roll4),function(x) randomPair(roll4,index=x) )
# n simulations, compute the naive prob. that the ijk is in one possible pair.
#create names for the pair list
names(pairList)<-apply(roll4,1,function(x) paste(x,sep="",collapse=","))
## save
# save(pairList,file="~/Documents/school/PM522A-Fall2020/lectures/pairList.RData")
print(roll4[1,]) %>%
kable() %>%
kable_styling()
## [1] 3 3 2 3
| x |
|---|
| 3 |
| 3 |
| 2 |
| 3 |
pairList[[1]] %>%
kable() %>%
kable_styling()
| combn_1 | combn_2 | combn_3 | |
|---|---|---|---|
| pair1.sum | 6 | 5 | 6 |
| pair2.sum | 5 | 6 | 5 |
Pseudo algorithm:
For each \(R_i\), containing the 3 possible sums of two pairs of \(i^{th}\) simulated roll, we compute the total count that the (i,j,k) was present in any combination. This is the total count of (i,j,k) is in all pairs given \(R_i\).
we then compute the marginal probability by averaging across all \(R_i\). This gives the estimated probability of (i,j,k) being in any 3 possible pairings of a typical roll.
The final probability then averages across the 3 combinations of pairs.
The estimated probability is the grand average across all simulated rolls, which measures the estimated probability of any possible combinations of pairing sums which contain any (i,j,k).
This model computed that at least one of (6,7,8) is present in any combination of pairs for a typical roll with probability 0.89, and (2,3,12) is present in any possible pair 0.22 in a typical roll of 4 dice.
# need to compute the frequency of finding at least one of ijk in one of the possible dice pair sums.
computeIJK_Probability<-function(i=NULL,pairList){
# we find the probability that the fixed ijk is in any possible combination.
#let A be the event that ijk is in at least one possible combination of pairs of die.
# we do a frequentist approach, where for each roll, we simply count the ijk is in each possible combo.
# we then average across all rolls, and then average across 3 possible combinations.
counts<-t(sapply(pairList,function(x) apply(x,2,function(y) (sum(y %in%i)))))
# average across combinations, the avg. number of (i,j,k) per roll.
combo.avg.count<-mean(colMeans(counts))
finalData<-data.frame(p=combo.avg.count,ijk=paste(i,sep="",collapse=","))
#make sure the probability sums to 1 for the group.
return(finalData)
}
rollFreq<-as.data.frame(prop.table(table(names(pairList))))
naive<-c()
for(i in 1:ncol(ijk)){
naive_i<-computeIJK_Probability(i=ijk[,i],pairList)
naive<-rbind(naive,naive_i)
}
pairProb<-naive[order(naive$p),]
pairProb$ijk<-factor(pairProb$ijk,levels=pairProb$ijk)
ggplot(pairProb,aes(x=ijk,y=p))+geom_point()+theme(axis.text.x=element_text(angle=90,size=11))+xlab("(i,j,k)")+ylab("estimated probability")+coord_flip()+ggtitle("Marginal estimated probability i,j,k given simulated rolls")
## How many trials before first success (geometric) Assuming a geometric distribution, the number of trials for the first success given (6,7,8) is 1.12, and the number of trials for the first success of (2,11,12) is approximately 4.6.
pairProb$geometric<-1/pairProb$p
ggplot(pairProb,aes(x=ijk,y=geometric))+geom_point()+theme(axis.text.x=element_text(angle=90,size=11))+xlab("(i,j,k)")+ylab("estimated number of trials for first success")+coord_flip()+ggtitle("Waiting time until 1st success given i,j,k")
Assuming a geometric distribution, the number of trials for the \(3^{rd}\) success given any 7 approximately 4.4 failures until 3rd success choosing a 7 and not a 2. The number of trials for the \(3^{rd}\) success of any 2 (and not containing a 7) is approximately 7.73 failures.
Conclusion: The interesting part, is that for 13 successes, the expected number of trials with any 7 is roughly 19-24 failures depending on whether the sum contains a 2 or not. This is far greater than the expected waiting time of 3 2’s.
pairProb$nb.3<-3/pairProb$p
ggplot(pairProb,aes(x=ijk,y=nb.3))+geom_point()+theme(axis.text.x=element_text(angle=90,size=11))+xlab("(i,j,k)")+ylab("estimated number of trials for 3rd success")+coord_flip()+ggtitle("Waiting time until 3rd success given i,j,k")
pairProb$contains2<-'NO'
pairProb[which(sapply(strsplit(as.character(pairProb$ijk),","),function(x) any(x=='2'))),'contains2']<-'YES'
pairProb%>%group_by(contains2)%>%summarise(time.until.3.success=mean(nb.3))%>%data.frame
## `summarise()` ungrouping output (override with `.groups` argument)
## contains2 time.until.3.success
## 1 NO 5.436517
## 2 YES 7.249215
pairProb$nb.13<-13/pairProb$p
ggplot(pairProb,aes(x=ijk,y=nb.13))+geom_point()+theme(axis.text.x=element_text(angle=90,size=11))+xlab("(i,j,k)")+ylab("estimated number of trials for 13th success")+coord_flip()+ggtitle("Waiting time until 13th success given i,j,k")
pairProb$contains7<-'NO'
pairProb[which(sapply(strsplit(as.character(pairProb$ijk),","),function(x) any(x=='7'))),'contains7']<-'YES'
pairProb%>%group_by(contains7)%>%summarise(time.until.13.success=mean(nb.13))%>%data.frame
## `summarise()` ungrouping output (override with `.groups` argument)
## contains7 time.until.13.success
## 1 NO 27.87323
## 2 YES 19.90662
pairProb%>%group_by(contains2,contains7)%>%summarise(time.to.13=mean(nb.13),time.to.3=mean(nb.3))%>%data.frame
## `summarise()` regrouping output by 'contains2' (override with `.groups` argument)
## contains2 contains7 time.to.13 time.to.3
## 1 NO NO 25.46442 5.876405
## 2 NO YES 19.11049 4.410113
## 3 YES NO 33.49380 7.729338
## 4 YES YES 23.09113 5.328723
As a quality check on the naive probability computation, I compute the naive probability of a single selection from [2-12] in the pairs of sums. For example, if I choose 7, what is the marginal probability of seeing a 7 in either paired sum?
the goal of this part is to compare the marginal probability computation function to the theoretical values.
We then compare the estimated marginal probability to the theoretical probability of observing a value in 2 independent rolls of a pair of dice. Since we are choosing a single number, and checking if it is in any combination of pairs, the expected probability is equivalent to just rolling 2 dice, twice independently.
As a simple way of checking the simulation and algorithm procedure, is to consider a single selection of 2-12, and computing the probability that one selection is present in at least one pair of dice sums.
We select a single number [2-12], and compute the probability that one selection (not 3), \(i \in [2-12]\), is in any possible pairs of sums.
Conclusion: we see that for a single number the simulated naive probability approximates the probability of observing that sum in two independent trials.
computeNaiveI<-function(pairList,ijk=NULL){
naive<-c()
for(i in 1:ncol(ijk)){
naive_i<-computeIJK_Probability(i=ijk[,i],pairList)
naive<-rbind(naive,naive_i)
}
return(naive)
}
I<-t(data.frame(seq(2,12)))
naiveI<-computeNaiveI(pairList,ijk=I)
###
#naive_34<-computeFrequentistProbabilityPair(i=c(3,4),pairList)
naiveI$half<-naiveI$p/2
colnames(naiveI)[1]<-'simulated'
naiveI$expected<-2*c(1,2,3,4,5,6,5,4,3,2,1)/36
naiveI$ijk<-factor(naiveI$ijk,levels=naiveI$ijk)
melt(naiveI[,-3],id.vars='ijk')%>%ggplot(aes(x=ijk,y=value,shape=variable,color=variable))+geom_point()+ggtitle('probability of a single selection in 2 trials')+xlab("one selection [2-12]")+ylab("Probability")
Using a 5,000 replications, from a raw simulation, we roll the dice until any combination of the pairs equals the sum of 2, and we wait until 3 2’s have been rolled in any combination of pairings. For 3 2’s the expected waiting time is roughly 22-23 rolls.
For 13 7’s the expected waiting time is approximately 20-21 rolls.
The waiting time for 13 7’s matched roughly the expected value of the negative binomial model. However the waiting time until 3 2’s was above 22, which did not match the expectation previously modeled.
simulateWaiting<-function(i=2,j=3,k=4,sumToStop=3){
success2<-0
trials<-0
while(success2<sumToStop){
trials<-trials+1
wait_2<-t(dice_roll(4, 1))
waitList<-lapply(1:nrow(wait_2),function(x) randomPair(wait_2,index=x) )
testing<-any(apply(waitList[[1]],2,function(x) any(x==i | x==j | x==k) ))
if(testing==TRUE){
success2<-success2+1
}
}
results<- data.frame(success=success2,trials=trials,i=i,j=j,k=k,totalSuccess=sumToStop)
return(results)
}
# simulateWaiting(i=2,j=0,k=0,sumToStop=3)
reps<-5000
wait2<-replicate(reps,simulateWaiting(i=0,j=2,k=0,sumToStop=3))
averageWait2<-mean(sapply(seq(1,reps),function(x) wait2[,,x]$trials ))
wait3<-replicate(reps,simulateWaiting(i=3,j=0,k=0,sumToStop=5))
averageWait3<-mean(sapply(seq(1,reps),function(x) wait3[,,x]$trials ))
wait4<-replicate(reps,simulateWaiting(i=0,j=4,k=0,sumToStop=7))
averageWait4<-mean(sapply(seq(1,reps),function(x) wait4[,,x]$trials ))
wait5<-replicate(reps,simulateWaiting(i=0,j=5,k=0,sumToStop=9))
averageWait5<-mean(sapply(seq(1,reps),function(x) wait5[,,x]$trials ))
wait6<-replicate(reps,simulateWaiting(i=0,j=6,k=0,sumToStop=11))
averageWait6<-mean(sapply(seq(1,reps),function(x) wait6[,,x]$trials ))
wait7<-replicate(reps,simulateWaiting(i=7,j=0,k=0,sumToStop=13))
averageWait7<-mean(sapply(seq(1,reps),function(x) wait7[,,x]$trials ))
wait8<-replicate(reps,simulateWaiting(i=0,j=8,k=0,sumToStop=11))
averageWait8<-mean(sapply(seq(1,reps),function(x) wait8[,,x]$trials ))
wait9<-replicate(reps,simulateWaiting(i=0,j=9,k=0,sumToStop=9))
averageWait9<-mean(sapply(seq(1,reps),function(x) wait9[,,x]$trials ))
wait10<-replicate(reps,simulateWaiting(i=0,j=10,k=0,sumToStop=7))
averageWait10<-mean(sapply(seq(1,reps),function(x) wait10[,,x]$trials ))
wait11<-replicate(reps,simulateWaiting(i=11,j=0,k=0,sumToStop=5))
averageWait11<-mean(sapply(seq(1,reps),function(x) wait11[,,x]$trials ))
wait12<-replicate(reps,simulateWaiting(i=12,j=0,k=0,sumToStop=3))
averageWait12<-mean(sapply(seq(1,reps),function(x) wait12[,,x]$trials ))
avgW<-data.frame(trials=c(averageWait2,averageWait3,averageWait4,
averageWait5,
averageWait6,
averageWait7,
averageWait8,
averageWait9,
averageWait10,
averageWait11,
averageWait12),
i=seq(2,12))
ggplot(avgW,aes(x=i,y=trials))+geom_point()+scale_x_discrete(name='single selection [2-12]',limits=seq(2,12))+ylab("Expected trials to reach top of column")+ggtitle("(Replicated 5,000) Expected waiting time for one selection")
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?