lbraswel — Nov 20, 2012, 4:59 PM
# Lance Braswell (@LanceBraswell)
# http://www.linkedin.com/in/lancebraswell
#
options(warn=-1)
library(gridExtra)
Loading required package: grid
library(ggplot2)
library(scales)
library(gplots)
Loading required package: gtools
Loading required package: gdata
gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
Attaching package: 'gdata'
The following object(s) are masked from 'package:stats':
nobs
The following object(s) are masked from 'package:utils':
object.size
Loading required package: caTools
Loading required package: bitops
Loading required package: KernSmooth
KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009
Loading required package: MASS
Attaching package: 'gplots'
The following object(s) are masked from 'package:stats':
lowess
# These are trivial calculations but I am learning R so it was a
# convenient mini-project to use
# Background:
# http://kb.vmware.com/selfservice/microsites/search.do?language=en_US&cmd=displayKC&externalId=1024025
# http://en.wikipedia.org/wiki/Birthday_problem
## Part 1: VMware VC instance.id Conflicts:
#
# In this section, we want to examine that given VMware
# makes a random choice VCenter instance.id in the range [0,63], what
# is the probability that an instance.id is duplicated versus the
# number of VCenters so deployed. This is variation of the famous Birthday
# Problem/Paradox. Paradoxical because it's actually more likely than
# one would expect with intuition that given a group of N people,
# what is probability that two people share the same birthday. For the
# original birthday problem, with N=23 there is a better than even chance
# (50%) two people have the same birthday.
# R implementation of info from http://en.wikipedia.org/wiki/Birthday_problem
birthdayApprox <- function (items, slots) {
round(1-exp(-(items*(items-1))/(2*slots)),4)
}
birthdayExact <- function (items, slots) {
round(1-(factorial(items) * choose(slots, items)/(slots ^ items)),4)
}
birthdaySimulated <- function (items, slots, runs=100) {
occ <- sum(sapply(1:runs, function(x) {
if (T %in% duplicated(sample(1:slots,items, replace=T))) {1} else {0}
}))
occ/runs
}
# We now apply the birthday problem with 64 slots instead of 365.
# VMware assigns a random instance.id in the range [0,63]
# shift it by 1 to make the math easier
id.slots <- 64
# See what happens with VC count in the range [1,id.slots]
vc.count <- c(1:id.slots)
# using the approximation
p.conflict.approx <- sapply(vc.count, function (x) { birthdayApprox(x,id.slots)} )
p.conflict.exact <- sapply(vc.count, function (x) { birthdayExact(x,id.slots)} )
p.conflict.simulated <- sapply(vc.count, function (x) { birthdaySimulated(x,id.slots)} )
InstanceIDdf <- data.frame(vc.count,p.conflict.approx,p.conflict.exact,p.conflict.simulated)
vc.count.50p <- as.integer(InstanceIDdf[ InstanceIDdf$p.conflict.exact >= 0.5,][1,1])
print(paste("The number of VCs required to produce a better than 50% chance of an InstanceID conflict: ", vc.count.50p))
[1] "The number of VCs required to produce a better than 50% chance of an InstanceID conflict: 10"
VCgraph <- ggplot(InstanceIDdf, aes(x=vc.count)) +
geom_point(aes(y = p.conflict.exact, colour = "Calculated")) +
geom_point(aes(y = p.conflict.simulated, colour = "Simulated")) +
labs(color="Method") +
geom_vline(xintercept = vc.count.50p) +
scale_x_continuous(breaks=c(seq(0,70,10)),labels = comma) +
ylab("Probability of InstanceID Conflict") +
xlab("Number of VCs deployed") +
ggtitle("Probability of Randomly assigned InstanceID conflict v. VC count")
textplot(InstanceIDdf[InstanceIDdf$vc.count < 38,])
VCgraph
#png(filename = "vcbdyprob.png", width = 1100, height = 400)
#dev.off()
## Part 2: VM MAC conflicts
#
# Suppose we have 2 VC with the same instance.id
# and VMware MAC addresses outside of the designated
# manual range: 00:50:56:00:00:00 - 00:50:56:3F:FF:FF
# This would seem to leave the auto-assigned range:
# 00:50:56:40:00:00 - 00:50:56:FF:FF:FF
# or 12 * 16 + 2^8 *2^8 possible MAC choices. Assuming VMware bakes in the 64
# possible VC ids one for one into the MAC addresses, this leaves:
mac.slots = 12*16 - 64 + (2^8)*(2^8)
# or 65664 available MAC address slots under each VC with
# a given instance.id
# Now suppose we are to deploy N number of VMs in the range of [1,2*slots]
# evenly split across two VCs, A and B with the same instance.id. Let's assume
# each VM deployed gets one NetworkAdapter with a MAC in the auto-assigned range
# from the section above.
# Let's calculate the number of VMs that would have conflicting
# MAC addresses:
# Each VC will get N/2 VMs. Given N/2 VM deployed on A, each VM deployed on B
# has an N/2/slots chance of having a conflicting MAC address. The excepted number
# of conflicts on B is then (N/2) * (N/2/slots). Since a conflict on B is a conflict A,
# the total number of VMs with a MAC address conflict is twice the number found on B:
# 2 * (N/2) * (N/2/slots)
# More generally, in the case of non-symetric deployment of VMs across two VCs A ad B,
# the expected number of conflicting VM MACs can also be calculated:
# Given N = n + m VMs deployed with n VMs on A and m VMs on B and n,m < slots
# the number of MAC address conflicts between the two VC is:
# 2 * (n*m)/slots
# which reduces to the above in the case of m = n = N/2
# (Note that the above arguments are also true if you consider VM NetworkAdapters
# deployed as oppsoed to just VMs).
expectedMACConflicts <- function (slots, n, m) {
if ((n > slots) || (m > slots)) stop("n, m should be <= slots")
as.integer(2 * (n * m) / slots)
}
# We can also simulate this by taking the mean number of MAC
# overlaps (intersection) in 100 sampling runs given n VMs on A and m VMs
# on B (actually order doesn't matter) where n,m <= slots
expectedMACConflictsSimulated <- function (slots, n, m, runs=100) {
if ((n > slots) || (m > slots)) stop("n, m should be <= slots")
n <- as.integer(n)
m <- as.integer(m)
# Read the next line as: Sample across the two ranges. Get the intersection of the two ranges.
# Count that number. Do it a total of $runs number of times. Take the mean of those results.
avgConflicts <- mean(sapply(1:runs,function(x){
length(intersect(sample(1:slots,n,replace=F),
sample(1:slots,m,replace=F)))
}))
as.integer(2 * avgConflicts)
}
# Let's plot out the results by sampling deployment of up to 2*slots VMs evenly split
vm.count <- c(seq(10,100,by=10),seq(200,1000,by=100),seq(2000,20000,by=1000),
seq(30000,mac.slots*2,by=10000),mac.slots*2)
# Here we a splitting evenly across two VCs hence the m,n = x/2:
expected.MACConflicts.calc <- sapply(vm.count,
function(x) {expectedMACConflicts(mac.slots,x/2,x/2)})
expected.MACConflicts.sim <- sapply(vm.count,
function(x) {expectedMACConflictsSimulated(mac.slots,x/2,x/2)})
MacConflictdf <- data.frame(vm.count,expected.MACConflicts.calc,expected.MACConflicts.sim)
textplot(MacConflictdf)
MACgraphAll <- ggplot(MacConflictdf, aes(vm.count)) +
geom_point(aes(y = expected.MACConflicts.calc, colour= "Calculated")) +
geom_point(aes(y = expected.MACConflicts.sim, colour = "Simulated")) +
labs(color="Method") +
scale_y_sqrt(limits=c(0,2*mac.slots),breaks=c(seq(0,2*mac.slots,20000)),labels = comma) +
scale_x_continuous(limits=c(0,2*mac.slots),breaks=c(seq(0,2*mac.slots,20000)),labels = comma) +
xlab("Total VMs deployed (evenly)") +
ylab(expression(sqrt("VMs with MAC conflicts"))) +
ggtitle("Expected VM MAC conflicts versus VMs deployed evenly\n across 2 VCs with the same InstanceID")
MACgraphFirst10K <- ggplot(MacConflictdf, aes(vm.count)) +
geom_point(aes(y = expected.MACConflicts.calc, colour = "Calculated")) +
geom_point(aes(y = expected.MACConflicts.sim, colour = "Simulated")) +
labs(color="Method") +
scale_y_continuous(limits=c(0,1000),labels = comma) +
scale_x_continuous(limits=c(0,10000),labels = comma) +
xlab("Total VMs deployed (evenly)") +
ylab("VMs with MAC conflicts") +
ggtitle("Expected VM MAC conficts versus VMs deployed evenly\nacross 2 VCs with the same InstanceID (first 10K VMs)")
#png(filename = "vcbdyprob.png", width = 1100, height = 400)
grid.arrange(MACgraphAll,MACgraphFirst10K, nrow=2, ncol=1)
Warning: Removed 22 rows containing missing values (geom_point).
Warning: Removed 22 rows containing missing values (geom_point).
#dev.off()