This is an analysis of spot prices for five Amazon Web Services (AWS) EC2 instance types. The data can be found at: http://dx.doi.org/10.17632/zcnp5xwvz6.1
Please cite the data as:
Monge, David A. (2018), “Amazon Web Services (AWS) Spot Prices Data 2016”, Mendeley Data, v1 http://dx.doi.org/10.17632/zcnp5xwvz6.1
# Instance types analysed; this vector is passed to load_instance_prices() below.
instanceTypes <- c("c3.2xlarge", "m3.2xlarge", "m3.medium", "r3.xlarge", "t2.micro")
# Pair an instance type with the path of its reduced spot-price CSV file.
#
# instanceType: instance type name, e.g. "m3.medium".
# Returns a character vector: the instance type first, then the file path
# (reduced-data folder + instance type + "a.csv").
build_path <- function(instanceType) {
  csv_path <- paste0("data/spot-prices/2016(Mar07-Jun07)/reduced/", instanceType, "a.csv")
  c(instanceType, csv_path)
}
# Read one header-less spot-price CSV and tag it with its instance type.
#
# tuple: character vector as produced by build_path(): element 1 is the
#        instance type, element 2 is the CSV path.
# Returns a list with elements instanceType, spotPrice and timestamp
# (the two CSV columns, in file order, are named spotPrice and timestamp).
# NOTE(review): this name matches readr::read_csv; if readr is attached, this
# definition is the one found first from the global environment.
read_csv <- function(tuple) {
  prices <- read.csv(tuple[2], header = FALSE, col.names = c("spotPrice", "timestamp"))
  c(instanceType = tuple[1], prices)
}
# Turn the list returned by read_csv() into a data frame.
#
# dataframe: list (or data frame) with elements instanceType, timestamp and
#            spotPrice.
# Returns a data.frame with columns instanceType, timestamp, spotPrice, in that
# order; a length-one instanceType is recycled to every row by data.frame().
build_dataframe <- function(dataframe) {
  data.frame(
    instanceType = dataframe$instanceType,
    timestamp = dataframe$timestamp,
    spotPrice = dataframe$spotPrice
  )
}
# Load the spot-price files of several instance types into one data frame.
#
# instanceTypes: character vector of instance type names.
# Returns the per-type data frames (instanceType, timestamp, spotPrice)
# stacked row-wise in the order of instanceTypes.
load_instance_prices <- function(instanceTypes) {
  per_type <- lapply(instanceTypes, function(type) {
    build_dataframe(read_csv(build_path(type)))
  })
  do.call(rbind, per_type)
}
# Spot prices of every analysed instance type, stacked in one data frame.
spot <- load_instance_prices(instanceTypes)

# On-demand instance catalogue (header-less CSV; column names assigned here).
ondemand <- read.csv(
  "data/instances/instances.csv",
  header = FALSE,
  col.names = c("category", "instanceType", "vCPUs", "ECU", "X", "memory", "disk", "onDemandPrice")
)

# Attach the on-demand attributes to every spot-price row of the same type.
instances <- inner_join(spot, ondemand, by = "instanceType")

# Mean spot price vs. on-demand price per type, and the saving in percent.
instances %>%
  group_by(instanceType) %>%
  summarise(
    meanSpotPrice = mean(spotPrice),
    onDemandPrice = mean(onDemandPrice)
  ) %>%
  mutate(savingPerc = (onDemandPrice - meanSpotPrice) / onDemandPrice * 100)
## # A tibble: 5 x 4
## instanceType meanSpotPrice onDemandPrice savingPerc
## <fct> <dbl> <dbl> <dbl>
## 1 c3.2xlarge 0.115 0.420 72.7
## 2 m3.2xlarge 0.112 0.532 79.0
## 3 m3.medium 0.0118 0.0670 82.4
## 4 r3.xlarge 0.0419 0.333 87.4
## 5 t2.micro 0.00315 0.0130 75.8
Histograms of spot prices.
# One histogram of spot prices per instance type; each panel has its own axes
# and the x axis is square-root scaled.
ggplot(spot, aes(x = spotPrice, fill = instanceType)) +
  geom_histogram(bins = 30, colour = "darkgray") +
  scale_x_sqrt() +
  facet_wrap(~ instanceType, scales = "free")
Histograms of spot price durations.
# duration = interval from a row's timestamp to the previous row's timestamp
# within the same instance type (the first row of each type has no previous
# row, so its duration is NA).
# NOTE(review): the sign of the interval depends on the row order of the input
# files — confirm.
ggplot(
  data = instances %>%
    group_by(instanceType) %>%
    mutate(duration = seconds(interval(ymd_hms(timestamp), lag(ymd_hms(timestamp))))),
  mapping = aes(x = duration, fill = instanceType)
) +
  geom_histogram(bins = 30, colour = "darkgray") +
  scale_x_sqrt() +
  facet_wrap(~ instanceType, scales = "free")
# Dates drawn as dotted vertical guide lines on the time-series plots below.
first_third <- ymd_hms("2016-04-07T00:00:00.000Z")
second_third <- ymd_hms("2016-05-07T00:00:00.000Z")
# Upper time limit used to restrict several plots to the start of the series.
first_two_weeks <- ymd_hms("2016-03-21T00:00:00.000Z")
# 3 months
# Spot price over the whole series, one row of panels per instance type with
# independent y axes; dotted blue lines mark first_third and second_third.
ggplot(
  data = instances %>% group_by(instanceType),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice)
) +
  geom_step(aes(group = 1)) +
  geom_vline(xintercept = c(first_third, second_third), linetype = "dotted", colour = "blue") +
  facet_grid(instanceType ~ ., scales = "free_y") +
  ggtitle("3 month series")
# 2 weeks
# Spot price as a fraction of the on-demand price, restricted to timestamps
# before first_two_weeks, one row of panels per instance type.
# The guide lines at first_third / second_third are intentionally not drawn
# here: both dates fall after first_two_weeks, and adding them as vlines would
# stretch the x axis far beyond the two-week window.
ggplot(
  data = instances %>%
    filter(ymd_hms(timestamp) < first_two_weeks) %>%
    group_by(instanceType),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice / onDemandPrice)
) +
  geom_step(aes(group = 1)) +
  facet_grid(instanceType ~ ., scales = "free_y") +
  ggtitle("2 week series")
\(price = spotPrice/onDemandPrice\)
# Spot price relative to the on-demand price for the first two weeks, all
# instance types in one panel.
ggplot(
  data = instances %>% filter(ymd_hms(timestamp) < first_two_weeks),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice / onDemandPrice)
) +
  geom_step(aes(color = instanceType)) +
  # On this scale the on-demand price is exactly 1 for every instance type.
  # (The previous mean("onDemandPrice") took the mean of a string literal,
  # which is NA, so no reference line was drawn.)
  geom_hline(yintercept = 1, linetype = "dashed", color = "red")
\(price = (spotPrice - min) / (max - min)\)
# Min-max rescaling: maps the smallest value of x to 0 and the largest to 1.
#
# x: numeric vector.
# Returns a numeric vector of the same length as x.
normalize <- function(x) {
  lowest <- min(x)
  span <- max(x) - lowest
  (x - lowest) / span
}
# Spot price rescaled to [0, 1] separately for each instance type (first two
# weeks only), all types in one panel.
# The former geom_abline(intercept = mean("onDemandPrice")) layer was removed:
# the mean of a string literal is NA, so it only produced warnings and drew
# nothing, and there is no single on-demand level on a per-type min-max scale.
ggplot(
  data = instances %>%
    filter(ymd_hms(timestamp) < first_two_weeks) %>%
    group_by(instanceType) %>%
    mutate(spotPrice_norm = normalize(spotPrice)),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice_norm)
) +
  geom_step(aes(color = instanceType))

# Distinct spot prices observed for each instance type.
unique_prices <- instances %>% group_by(instanceType) %>% distinct(spotPrice)
Out-of-bid (OOB) errors for two bids.
# Instance type shown in the OOB plot below.
selInstanceType <- "c3.2xlarge"
# The two bid levels compared against spotPrice in the OOB plot.
highBid <- 0.1595 # 0.0895
lowBid <- 0.0969
# Classify spot prices against a low and a high bid.
#
# price:   numeric vector (or single value) of spot prices.
# lowBid:  the lower bid.
# highBid: the higher bid.
# Returns a character vector the same length as price:
#   "OOB (high bid)" where price > highBid,
#   "OOB (low bid)"  where lowBid < price <= highBid,
#   "no OOB errors"  where price <= lowBid.
# The high bid is tested first; testing the low bid first made the high-bid
# label unreachable whenever highBid > lowBid. ifelse() is used so a whole
# column of prices can be classified at once.
errorColor <- function(price, lowBid, highBid) {
  ifelse(
    price > highBid,
    "OOB (high bid)",
    ifelse(price > lowBid, "OOB (low bid)", "no OOB errors")
  )
}
# Price rows of the selected instance type only.
data <- instances %>%
  filter(instanceType == selInstanceType) %>%
  group_by(instanceType)

# Disabled single-layer alternative that colours by errorColor():
#ggplot(data = data, aes(x = ymd_hms(timestamp), y = spotPrice)) +
# geom_step(aes(group = 1, color = errorColor(spotPrice, lowBid, highBid))) +
# geom_hline(aes(yintercept = mean(onDemandPrice), linetype = "dashed", color = "on-demand price")) + # on-demand price
# geom_vline(xintercept = c(first_third, second_third), linetype = "dotted", color = "blue")+ # month ticks
# geom_hline(aes(yintercept = mean(lowBid), color = "OOB (low bid)")) +
# geom_hline(aes(yintercept = mean(highBid), color = "OOB (high bid)"))

# Spot price of the selected type, split into three price bands by the two
# bids. Every band is taken from `data`; the low-bid and high-bid bands used
# to be filtered from `instances`, which mixed in all other instance types.
ggplot(data = data %>% filter(spotPrice <= lowBid), aes(x = ymd_hms(timestamp), y = spotPrice)) +
  geom_step(aes(group = 1, color = "no OOB errors")) + # no error zone
  geom_step(data = data %>% filter(spotPrice > lowBid & spotPrice <= highBid), aes(color = "OOB (low bid)")) + # low bid
  geom_hline(aes(yintercept = lowBid, color = "OOB (low bid)")) +
  geom_step(data = data %>% filter(spotPrice > highBid), aes(color = "OOB (high bid)")) + # high bid
  geom_hline(aes(yintercept = highBid, color = "OOB (high bid)")) +
  # linetype is a fixed setting, so it belongs outside aes(); inside aes() it
  # was treated as a data value and the line was not actually dashed.
  geom_hline(aes(yintercept = mean(onDemandPrice), color = "on-demand price"), linetype = "dashed") + # on-demand price
  geom_vline(xintercept = c(first_third, second_third), linetype = "dotted", color = "blue") # month ticks
Each curve shows, for a given bid, the probability that a randomly selected time step falls in an OOB error (i.e., the spot price exceeds the bid).
Note: for bids below the minimum price in the data, the probability is 1. Conversely, for bids greater than or equal to the maximum price in the data, the probability is 0.
# Number of equal-width steps between each type's minimum and maximum price.
bins <- 100
# Bin indices 0..bins (bins + 1 values), so both end points are included.
binIds <- seq(0, bins)
# bins
# One row per (instanceType, binId) pair, carrying timeRange: the number of
# seconds between the earliest and the latest timestamp.
# NOTE(review): no group_by() precedes the mutate(), so min/max are taken over
# all rows of `instances` rather than per instance type — confirm that this is
# intended.
instanceBins <- crossing(instances %>%
select(instanceType, timestamp) %>%
mutate(timeRange = as.numeric(seconds(interval(min(ymd_hms(timestamp)), max(ymd_hms(timestamp)))))) %>%
select(-timestamp) %>%
unique(), binIds) %>%
rename(binId = binIds)
# bids
# Candidate bids per instance type: bins + 1 evenly spaced values running from
# the type's minimum spot price (binId 0) to its maximum (binId == bins).
bids <- instances %>%
  group_by(instanceType) %>%
  mutate(
    maxPrice = max(spotPrice),
    minPrice = min(spotPrice),
    binSize = (maxPrice - minPrice) / bins
  ) %>%
  select(instanceType, minPrice, maxPrice, binSize) %>%
  unique() %>%
  inner_join(instanceBins, by = "instanceType") %>%
  mutate(bid = minPrice + binSize * binId) %>%
  select(instanceType, binId, bid, timeRange)
# failure probabilities
# For every (instanceType, bid) pair: the sum, over price rows whose spotPrice
# exceeds the bid, of that row's duration divided by timeRange.
probabilities <- instances %>%
group_by(instanceType) %>%
# duration = interval from this row's timestamp to the previous row's timestamp
# within the same instance type; it is computed on the incoming row order.
# NOTE(review): its sign depends on how the input files are sorted — confirm.
mutate(duration = seconds(interval(ymd_hms(timestamp), lag(ymd_hms(timestamp))))) %>%
# the first row of each type has no previous row, hence an NA duration
filter(!is.na(duration)) %>%
# sorting happens after the lag() above, so it does not change the durations
arrange(timestamp) %>%
# pairs every price row with every candidate bid of its instance type
inner_join(bids, by = "instanceType") %>%
select(instanceType, timestamp, spotPrice, duration, bid, binId, timeRange) %>%
# contribution of the row: duration / timeRange when spotPrice > bid, else 0
mutate(probFailure = as.integer(spotPrice > bid) * duration / timeRange) %>%
group_by(instanceType, binId, bid) %>%
summarise(probability = sum(probFailure))
# joint
# All instance types in one panel, plotted against the bin index so that the
# differently priced types share a common x axis.
ggplot(probabilities, aes(x = binId, y = probability, colour = instanceType)) +
  geom_line()

# separated
# One panel per instance type, plotted against the bid itself; each panel has
# its own x axis.
ggplot(probabilities, aes(x = bid, y = probability, colour = instanceType)) +
  geom_line() +
  facet_wrap(~ instanceType, scales = "free_x")
This bidding method sets each bid as a multiple of the current spot price: \(bid = currentPrice \times augment,\) where \(augment\) is the factor applied to the current spot price (e.g. 2.5 means the bid is 250% of the current price).
# Multiplier applied to the current spot price to obtain the bid.
augment <- 2.5 # 250%

# Spot price, the derived bid (spotPrice * augment) and the on-demand price
# for the first two weeks, one row of panels per instance type.
ggplot(
  data = instances %>% filter(ymd_hms(timestamp) < first_two_weeks),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice)
) +
  geom_step(aes(color = instanceType)) +
  # Plain strings inside aes() become legend entries. I("bid") marks the value
  # as a literal colour, which recent ggplot2 versions pass straight to the
  # graphics device, where "bid" is not a valid colour name.
  geom_step(aes(y = spotPrice * augment, color = "bid")) +
  #geom_abline(slope = 0, intercept = mean("onDemandPrice"), linetype = "dashed", color = "red") +
  # One on-demand line per panel, taken from that panel's instance type.
  # mean(onDemandPrice) inside aes() was evaluated over all rows, so every
  # panel showed the same cross-type average. linetype is a fixed setting and
  # therefore sits outside aes().
  geom_hline(
    data = instances %>% distinct(instanceType, onDemandPrice),
    mapping = aes(yintercept = onDemandPrice, color = "on-demand price"),
    linetype = "dashed"
  ) +
  facet_grid(instanceType ~ ., scales = "free_y")