Description

This is an analysis of spot prices for five Amazon Web Services (AWS) EC2 instance types. The data can be found at: http://dx.doi.org/10.17632/zcnp5xwvz6.1

Please cite the data as:

Monge, David A. (2018), “Amazon Web Services (AWS) Spot Prices Data 2016”, Mendeley Data, v1 http://dx.doi.org/10.17632/zcnp5xwvz6.1

Functions to load the data

# EC2 instance types whose spot-price histories are loaded and analysed below.
instanceTypes <- c("c3.2xlarge", "m3.2xlarge", "m3.medium", "r3.xlarge", "t2.micro")

  # Pair an instance type with the location of its reduced spot-price CSV.
  #
  # instanceType: instance type name, e.g. "m3.medium".
  # Returns a character vector c(<instanceType>, <path to its CSV file>).
  build_path <- function(instanceType) {
    csv_path <- paste0(
      "data/spot-prices/2016(Mar07-Jun07)/reduced/",
      instanceType,
      "a.csv"
    )
    c(instanceType, csv_path)
  }
  
  # Read one headerless spot-price CSV and tag it with its instance type.
  #
  # tuple: character vector c(<instanceType>, <csv path>), as built by build_path().
  # Returns a list with elements instanceType, spotPrice and timestamp
  # (c() of a character value and a data frame yields a plain list).
  # NOTE(review): shares its name with readr::read_csv; if readr is attached
  # this definition masks it -- confirm that is intended.
  read_csv <- function(tuple) {
    prices <- read.csv(
      tuple[2],
      header = FALSE,
      col.names = c("spotPrice", "timestamp")
    )
    c(instanceType = tuple[1], prices)
  }

  # Turn the list produced by read_csv() into a data frame with the columns
  # ordered instanceType, timestamp, spotPrice.
  #
  # dataframe: list (or data frame) with elements instanceType, timestamp, spotPrice.
  # Returns a data.frame; a length-one instanceType is recycled to every row.
  build_dataframe <- function(dataframe) {
    type_col <- dataframe$instanceType
    time_col <- dataframe$timestamp
    price_col <- dataframe$spotPrice
    data.frame(
      instanceType = type_col,
      timestamp = time_col,
      spotPrice = price_col
    )
  }
  
  # Load the spot-price history of several instance types into one data frame.
  #
  # instanceTypes: character vector of instance type names.
  # Returns the row-bound per-type data frames (columns instanceType,
  # timestamp, spotPrice).
  load_instance_prices <- function(instanceTypes) {
    per_type <- lapply(instanceTypes, function(type) {
      build_dataframe(read_csv(build_path(type)))
    })
    do.call(rbind, per_type)
  }

Load spot instance prices

# Spot price history for every instance type listed in instanceTypes.
spot <- load_instance_prices(instanceTypes)

# On-demand instance catalogue; the file is read without a header row, so the
# column names are supplied here.
ondemand <- read.csv(
  "data/instances/instances.csv",
  header = FALSE,
  col.names = c(
    "category", "instanceType", "vCPUs", "ECU",
    "X", "memory", "disk", "onDemandPrice"
  )
)

# Attach the on-demand attributes to each spot-price row; rows whose instance
# type is missing from either table are dropped.
instances <- inner_join(spot, ondemand, by = "instanceType")

Average cost savings

# Per instance type: mean spot price, on-demand price, and the saving of the
# mean spot price relative to the on-demand price, in percent.
instances %>%
  group_by(instanceType) %>%
  summarise(
    meanSpotPrice = mean(spotPrice),
    onDemandPrice = mean(onDemandPrice)
  ) %>%
  mutate(savingPerc = (onDemandPrice - meanSpotPrice) / onDemandPrice * 100)
## # A tibble: 5 x 4
##   instanceType meanSpotPrice onDemandPrice savingPerc
##   <fct>                <dbl>         <dbl>      <dbl>
## 1 c3.2xlarge         0.115          0.420        72.7
## 2 m3.2xlarge         0.112          0.532        79.0
## 3 m3.medium          0.0118         0.0670       82.4
## 4 r3.xlarge          0.0419         0.333        87.4
## 5 t2.micro           0.00315        0.0130       75.8

Spot prices

Histograms of spot prices.

# One histogram panel per instance type, square-root x axis, axes free per panel.
ggplot(spot, aes(x = spotPrice, fill = instanceType)) +
  geom_histogram(color = "darkgray", bins = 30) +
  scale_x_sqrt() +
  facet_wrap(~instanceType, scales = "free")

Spot price durations

Histograms of spot price durations.

# duration: interval between each record's timestamp and the previous row's
# timestamp within the same instance type (lag() is applied per group, so the
# first row of every type has no duration).
ggplot(
  data = instances %>%
    group_by(instanceType) %>%
    mutate(
      duration = seconds(interval(ymd_hms(timestamp), lag(ymd_hms(timestamp))))
    ),
  mapping = aes(x = duration, fill = instanceType)
) +
  geom_histogram(bins = 30, color = "darkgray") +
  scale_x_sqrt() +
  facet_wrap(~instanceType, scales = "free")

Time series

# Reference instants used by the time-series plots below.
# first_third / second_third are drawn as vertical marker lines;
# first_two_weeks is the upper cutoff used to filter the two-week views.
first_third <- ymd_hms("2016-04-07T00:00:00.000Z")
second_third <- ymd_hms("2016-05-07T00:00:00.000Z")
first_two_weeks <- ymd_hms("2016-03-21T00:00:00.000Z")

3 months

# Full series of spot prices, one panel (row) per instance type.
ggplot(
  data = instances %>% group_by(instanceType),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice)
) +
  geom_step(aes(group = 1)) +
  # Dotted blue markers at first_third and second_third.
  geom_vline(
    xintercept = c(first_third, second_third),
    linetype = "dotted",
    color = "blue"
  ) +
  facet_grid(instanceType ~ ., scales = "free_y") +
  ggtitle("3 month series")

2 weeks

# Records before first_two_weeks only; y is the spot price as a fraction of the
# on-demand price, one panel (row) per instance type.
ggplot(
  data = instances %>%
    filter(ymd_hms(timestamp) < first_two_weeks) %>%
    group_by(instanceType),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice / onDemandPrice)
) +
  geom_step(aes(group = 1)) +
  geom_vline(
    xintercept = c(first_third, second_third),
    linetype = "dotted",
    color = "blue"
  ) +
  facet_grid(instanceType ~ ., scales = "free_y") +
  ggtitle("2 week series")

Series overlap standardized by on-demand price

\(price = spotPrice/onDemandPrice\)

# Two-week view of all instance types on one axis. y is the spot price as a
# fraction of the on-demand price, so the on-demand price itself sits at y = 1.
ggplot(
  data = instances %>% filter(ymd_hms(timestamp) < first_two_weeks),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice / onDemandPrice)
) +
  geom_step(aes(color = instanceType)) +
  # Reference line at the on-demand level. The intercept used to be
  # mean("onDemandPrice"), i.e. the mean of a character string, which is NA,
  # so no line was drawn.
  geom_abline(slope = 0, intercept = 1, linetype = "dashed", color = "red")

Series overlap normalized [0, 1]

\(price = (spotPrice - min) / (max - min)\)

# Min-max rescale a numeric vector to [0, 1]: (x - min) / (max - min).
#
# x: numeric vector.
# Returns a numeric vector of the same length as x. A constant input
# (max == min) yields all zeros instead of the NaN produced by 0 / 0.
# NA values in x still propagate to every element, because min()/max() are
# called without na.rm.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  # Guard the zero-width range; is.na() is tested first so an NA/NaN span
  # never reaches the comparison.
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}

# Two-week view with every instance type rescaled to [0, 1] independently
# (normalize() runs inside group_by(instanceType)).
# The former geom_abline(intercept = mean("onDemandPrice")) layer is removed:
# the mean of a character string is NA, so it never drew anything, and after
# per-type rescaling there is no single on-demand level to mark on this axis.
ggplot(
  data = instances %>%
    filter(ymd_hms(timestamp) < first_two_weeks) %>%
    group_by(instanceType) %>%
    mutate(spotPrice_norm = normalize(spotPrice)),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice_norm)
) +
  geom_step(aes(color = instanceType))

Out of bid (OOB) errors

# Distinct spot prices observed for each instance type.
unique_prices <- distinct(group_by(instances, instanceType), spotPrice)

Example

OOB errors for two bids.

# Instance type shown in the example, and the two bids compared in it.
# highBid is the larger of the two bids; prices above a bid are out-of-bid
# (OOB) for that bid.
selInstanceType <- "c3.2xlarge"
highBid <- 0.1595  # 0.0895
lowBid <- 0.0969

# Classify spot prices by which of two bids they exceed.
#
# price:   numeric vector (or scalar) of spot prices.
# lowBid:  the lower bid.
# highBid: the higher bid (expected to be >= lowBid).
# Returns a character vector the same length as price:
#   "OOB (high bid)"  price >  highBid             (both bids are outbid)
#   "OOB (low bid)"   lowBid < price <= highBid    (only the low bid is outbid)
#   "no OOB errors"   price <= lowBid
# NA prices yield NA.
#
# The high bid is tested first: with the low bid tested first, every price
# above highBid was labelled "OOB (low bid)" and the high-bid label could
# never be returned. ifelse() is used so a whole price column can be passed.
errorColor <- function(price, lowBid, highBid) {
  ifelse(
    price > highBid,
    "OOB (high bid)",
    ifelse(price > lowBid, "OOB (low bid)", "no OOB errors")
  )
}

# Rows of the selected instance type only (kept grouped by instance type).
data <- group_by(
  filter(instances, instanceType == selInstanceType),
  instanceType
)


#ggplot(data = data, aes(x = ymd_hms(timestamp), y = spotPrice)) +
#  geom_step(aes(group = 1, color = errorColor(spotPrice, lowBid, highBid))) + 
#  geom_hline(aes(yintercept = mean(onDemandPrice), linetype = "dashed", color = "on-demand price")) + # on-demand price 
#  geom_vline(xintercept = c(first_third, second_third), linetype = "dotted", color = "blue")+ # month ticks
#  geom_hline(aes(yintercept = mean(lowBid), color = "OOB (low bid)")) +
#  geom_hline(aes(yintercept = mean(highBid), color = "OOB (high bid)")) 

# Spot price of the selected instance type, split into three bands by the two
# bids so each band gets its own colour and legend entry:
#   price <= lowBid             neither bid is outbid
#   lowBid < price <= highBid   only the low bid is outbid
#   price > highBid             both bids are outbid
ggplot(
  data = data %>% filter(spotPrice <= lowBid),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice)
) +
  geom_step(aes(group = 1, color = "no OOB errors")) + # no error zone
  # The two OOB bands are taken from `data` (already restricted to
  # selInstanceType). They used to be taken from `instances`, which also drew
  # the prices of every other instance type that fell inside the band.
  geom_step(
    data = data %>% filter(spotPrice > lowBid & spotPrice <= highBid),
    aes(color = "OOB (low bid)")
  ) + # low bid
  geom_hline(aes(yintercept = lowBid, color = "OOB (low bid)")) +
  geom_step(
    data = data %>% filter(spotPrice > highBid),
    aes(color = "OOB (high bid)")
  ) + # high bid
  geom_hline(aes(yintercept = highBid, color = "OOB (high bid)")) +
  # linetype is set outside aes(): inside aes() the string "dashed" is treated
  # as a data value to map, not as the line type to draw.
  geom_hline(
    aes(yintercept = mean(onDemandPrice), color = "on-demand price"),
    linetype = "dashed"
  ) + # on-demand price
  geom_vline(
    xintercept = c(first_third, second_third),
    linetype = "dotted",
    color = "blue"
  ) # month ticks

OOB error probabilities

Each curve gives, for a given bid, the probability that a randomly selected point in time falls in an OOB error, i.e. that the spot price exceeds the bid at that moment.

Note: for bids below the minimum price in the data, the probability is 1. Conversely, for bids greater than or equal to the maximum price in the data, the probability is 0.

# Number of equal-width price steps between the minimum and maximum price;
# binIds runs from 0 to bins, i.e. bins + 1 candidate bids per instance type.
bins <- 100
binIds <- seq(0, bins)

# One row per (instanceType, binId) pair. timeRange is the span between the
# earliest and latest timestamp, computed over all rows of `instances`
# (no grouping is applied in this pipeline).
instanceBins <- crossing(
  instances %>%
    select(instanceType, timestamp) %>%
    mutate(
      timeRange = as.numeric(seconds(interval(min(ymd_hms(timestamp)), max(ymd_hms(timestamp)))))
    ) %>%
    select(-timestamp) %>%
    unique(),
  binIds
) %>%
  rename(binId = binIds)

# bids
# Candidate bids: for every instance type, bins + 1 evenly spaced values from
# its minimum to its maximum observed spot price (bid = minPrice + binSize * binId).
bids <- instances %>%
  group_by(instanceType) %>%
  mutate(
    maxPrice = max(spotPrice),
    minPrice = min(spotPrice),
    binSize = (maxPrice - minPrice) / bins
  ) %>%
  select(instanceType, minPrice, maxPrice, binSize) %>%
  unique() %>%
  inner_join(instanceBins, by = "instanceType") %>%
  mutate(bid = minPrice + binSize * binId) %>%
  select(instanceType, binId, bid, timeRange)

# failure probabilities
probabilities <- instances %>% 
  group_by(instanceType) %>%
  mutate(duration = seconds(interval(ymd_hms(timestamp), lag(ymd_hms(timestamp))))) %>%
  filter(!is.na(duration)) %>%
  arrange(timestamp) %>%
  inner_join(bids, by = "instanceType") %>%
  select(instanceType, timestamp, spotPrice, duration, bid, binId, timeRange) %>%
  mutate(probFailure = as.integer(spotPrice > bid) * duration / timeRange) %>%
  group_by(instanceType, binId, bid) %>%
  summarise(probability = sum(probFailure))
  
# joint
ggplot(data = probabilities, aes(x = binId, y = probability, color = instanceType)) +
   geom_line()

# separated
ggplot(data = probabilities, aes(x = bid, y = probability, color = instanceType)) +
   geom_line() +
   facet_wrap(~instanceType, scales = "free_x")

Augmented current price prediction (ACPP)

ACPP is a bidding method that sets the bid to a multiple of the current spot price: \(bid = currentPrice \times augment\), where \(augment\) is the factor applied to the current spot price (for example, \(augment = 2.5\) bids 250% of the current price).

Example

# Multiplier applied to the current spot price to obtain the bid.
augment <- 2.5  # 250%

# Two-week view, one panel per instance type: the spot price, the ACPP bid
# (spotPrice * augment) and the on-demand price of that instance type.
ggplot(
  data = instances %>% filter(ymd_hms(timestamp) < first_two_weeks),
  mapping = aes(x = ymd_hms(timestamp), y = spotPrice)
) +
  geom_step(aes(color = instanceType)) +
  # Plain strings inside aes() become legend labels on the colour scale, as in
  # the OOB example plot.
  # NOTE(review): the labels were previously wrapped in I(); recent ggplot2
  # versions appear to treat I() values as literal colours -- confirm.
  geom_step(aes(y = spotPrice * augment, color = "bid")) +
  # One on-demand line per instance type. mean(onDemandPrice) inside aes() was
  # a single mean over all instance types, drawn identically in every panel.
  # linetype is set outside aes() so the line is actually dashed.
  geom_hline(
    data = instances %>% distinct(instanceType, onDemandPrice),
    aes(yintercept = onDemandPrice, color = "on-demand price"),
    linetype = "dashed"
  ) +
  facet_grid(instanceType ~ ., scales = "free_y")