vcBdayProblem.R

lbraswel — Nov 20, 2012, 4:59 PM

# Lance Braswell (@LanceBraswell)
# http://www.linkedin.com/in/lancebraswell
#
options(warn=-1)
library(gridExtra)
Loading required package: grid
library(ggplot2)
library(scales)
library(gplots)
Loading required package: gtools
Loading required package: gdata
gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.

gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
Attaching package: 'gdata'
The following object(s) are masked from 'package:stats':

nobs
The following object(s) are masked from 'package:utils':

object.size
Loading required package: caTools
Loading required package: bitops
Loading required package: KernSmooth
KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009
Loading required package: MASS
Attaching package: 'gplots'
The following object(s) are masked from 'package:stats':

lowess

# These are trivial calculations but I am learning R so it was a
# convenient mini-project to use

# Background:
# http://kb.vmware.com/selfservice/microsites/search.do?language=en_US&cmd=displayKC&externalId=1024025
# http://en.wikipedia.org/wiki/Birthday_problem

## Part 1: VMware VC instance.id Conflicts:
#
# In this section, we want to examine that given VMware 
# makes a random choice VCenter instance.id in the range [0,63], what
# is the probability that an instance.id is duplicated versus the
# number of VCenters so deployed. This is variation of the famous Birthday
# Problem/Paradox. Paradoxical because it's actually more likely than
# one would expect with intuition that given a group of N people,
# what is probability that two people share the same birthday. For the
# original birthday problem, with N=23 there is a better than even chance
# (50%) two people have the same birthday.

# R implementation of info from http://en.wikipedia.org/wiki/Birthday_problem
birthdayApprox <- function (items, slots) {
  round(1-exp(-(items*(items-1))/(2*slots)),4)
}

birthdayExact <- function (items, slots) {
  round(1-(factorial(items) * choose(slots, items)/(slots ^ items)),4)
}

birthdaySimulated <- function (items, slots, runs=100) {
    occ <- sum(sapply(1:runs, function(x) {
                          if (T %in% duplicated(sample(1:slots,items, replace=T))) {1} else {0}
                        }))
    occ/runs
}

# We now apply the birthday problem with 64 slots instead of 365.
# VMware assigns a random instance.id in the range [0,63]
# shift it by 1 to make the math easier
id.slots <- 64
# See what happens with VC count in the range [1,id.slots]
vc.count <- c(1:id.slots)

# using the approximation
p.conflict.approx <- sapply(vc.count, function (x) { birthdayApprox(x,id.slots)} )
p.conflict.exact <- sapply(vc.count, function (x) { birthdayExact(x,id.slots)} )
p.conflict.simulated <- sapply(vc.count, function (x) { birthdaySimulated(x,id.slots)} )
InstanceIDdf <- data.frame(vc.count,p.conflict.approx,p.conflict.exact,p.conflict.simulated)
vc.count.50p <- as.integer(InstanceIDdf[ InstanceIDdf$p.conflict.exact >= 0.5,][1,1])

print(paste("The number of VCs required to produce a better than 50% chance of an InstanceID conflict: ", vc.count.50p))
[1] "The number of VCs required to produce a better than 50% chance of an InstanceID conflict:  10"
VCgraph <- ggplot(InstanceIDdf, aes(x=vc.count)) +
  geom_point(aes(y = p.conflict.exact, colour = "Calculated")) +
  geom_point(aes(y = p.conflict.simulated, colour = "Simulated")) +
  labs(color="Method") +
  geom_vline(xintercept = vc.count.50p) +
  scale_x_continuous(breaks=c(seq(0,70,10)),labels = comma) +
  ylab("Probability of InstanceID Conflict") +
  xlab("Number of VCs deployed") +
  ggtitle("Probability of Randomly assigned InstanceID conflict v. VC count")

textplot(InstanceIDdf[InstanceIDdf$vc.count < 38,])

plot of chunk unnamed-chunk-1

VCgraph

plot of chunk unnamed-chunk-1


#png(filename = "vcbdyprob.png", width = 1100, height = 400)
#dev.off()

## Part 2: VM MAC conflicts
#
# Suppose we have 2 VC with the same instance.id 
# and VMware MAC addresses outside of the designated
# manual range: 00:50:56:00:00:00 - 00:50:56:3F:FF:FF
# This would seem to leave the auto-assigned range: 
# 00:50:56:40:00:00 - 00:50:56:FF:FF:FF
# or 12 * 16 + 2^8 *2^8 possible MAC choices. Assuming VMware bakes in the 64
# possible VC ids one for one into the MAC addresses, this leaves:
mac.slots = 12*16 - 64 + (2^8)*(2^8) 
# or 65664 available MAC address slots under each VC with
# a given instance.id

# Now suppose we are to deploy N number of VMs in the range of [1,2*slots]
# evenly split across two VCs, A and B with the same instance.id. Let's assume 
# each VM deployed gets one NetworkAdapter with a MAC in the auto-assigned range
# from the section above.
# Let's calculate the number of VMs that would have conflicting
# MAC addresses:
# Each VC will get N/2 VMs. Given N/2 VM deployed on A, each VM deployed on B
# has an N/2/slots chance of having a conflicting MAC address. The excepted number 
# of conflicts on B is then (N/2) * (N/2/slots). Since a conflict on B is a conflict A,
# the total number of VMs with a MAC address conflict is twice the number found on B:
# 2 * (N/2) * (N/2/slots)
# More generally, in the case of non-symetric deployment of VMs across two VCs A ad B,
# the expected number of conflicting VM MACs can also be calculated:
# Given N = n + m VMs deployed with n VMs on A and m VMs on B and n,m < slots
# the number of MAC address conflicts between the two VC is:
# 2 * (n*m)/slots
# which reduces to the above in the case of m = n = N/2
# (Note that the above arguments are also true if you consider VM NetworkAdapters
# deployed as oppsoed to just VMs).
expectedMACConflicts <- function (slots, n, m) {
  if ((n > slots) || (m > slots)) stop("n, m should be <= slots")
  as.integer(2 * (n * m) / slots)
}

# We can also simulate this by taking the mean number of MAC 
# overlaps (intersection) in 100 sampling runs given n VMs on A and m VMs
# on B (actually order doesn't matter) where n,m <= slots
expectedMACConflictsSimulated <- function (slots, n, m, runs=100) {
  if ((n > slots) || (m > slots)) stop("n, m should be <= slots")
  n <- as.integer(n)
  m <- as.integer(m)
  # Read the next line as: Sample across the two ranges. Get the intersection of the two ranges.
  # Count that number. Do it a total of $runs number of times. Take the mean of those results.
  avgConflicts <- mean(sapply(1:runs,function(x){
                            length(intersect(sample(1:slots,n,replace=F),
                                             sample(1:slots,m,replace=F)))
                          }))
  as.integer(2 * avgConflicts)
}

# Let's plot out the results by sampling deployment of up to 2*slots VMs evenly split
vm.count <- c(seq(10,100,by=10),seq(200,1000,by=100),seq(2000,20000,by=1000),
              seq(30000,mac.slots*2,by=10000),mac.slots*2)
# Here we a splitting evenly across two VCs hence the m,n = x/2:
expected.MACConflicts.calc <- sapply(vm.count,
                                     function(x) {expectedMACConflicts(mac.slots,x/2,x/2)})
expected.MACConflicts.sim  <- sapply(vm.count,
                                     function(x) {expectedMACConflictsSimulated(mac.slots,x/2,x/2)})

MacConflictdf <- data.frame(vm.count,expected.MACConflicts.calc,expected.MACConflicts.sim)
textplot(MacConflictdf)

plot of chunk unnamed-chunk-1


MACgraphAll <- ggplot(MacConflictdf, aes(vm.count)) +
  geom_point(aes(y = expected.MACConflicts.calc, colour= "Calculated")) +
  geom_point(aes(y = expected.MACConflicts.sim, colour = "Simulated")) +
  labs(color="Method") +
  scale_y_sqrt(limits=c(0,2*mac.slots),breaks=c(seq(0,2*mac.slots,20000)),labels = comma) +
  scale_x_continuous(limits=c(0,2*mac.slots),breaks=c(seq(0,2*mac.slots,20000)),labels = comma) +
  xlab("Total VMs deployed (evenly)") +
  ylab(expression(sqrt("VMs with MAC conflicts"))) +
  ggtitle("Expected VM MAC conflicts versus VMs deployed evenly\n across 2 VCs with the same InstanceID")

MACgraphFirst10K <- ggplot(MacConflictdf, aes(vm.count)) +
  geom_point(aes(y = expected.MACConflicts.calc, colour = "Calculated")) +
  geom_point(aes(y = expected.MACConflicts.sim, colour = "Simulated")) +
  labs(color="Method") +
  scale_y_continuous(limits=c(0,1000),labels = comma) +
  scale_x_continuous(limits=c(0,10000),labels = comma) +
  xlab("Total VMs deployed (evenly)") +
  ylab("VMs with MAC conflicts") +
  ggtitle("Expected VM MAC conficts versus VMs deployed evenly\nacross 2 VCs with the same InstanceID (first 10K VMs)")

#png(filename = "vcbdyprob.png", width = 1100, height = 400)
grid.arrange(MACgraphAll,MACgraphFirst10K, nrow=2, ncol=1)
Warning: Removed 22 rows containing missing values (geom_point).
Warning: Removed 22 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-1

#dev.off()