Data Aggregation and K-means example

A quick reproducible example of summarizing data and K-means clustering on a randomly generated gaming dataset. Steps in the analysis are mostly commented.

Aggregating Data

Start by setting up the player data frame.

# load packages
library(dplyr)
library(lubridate)
library(ggplot2)

# fix the RNG state so the simulated data can be regenerated exactly
set.seed(234)

# one ID per player: the first 20 capital letters
player.ids <- LETTERS[seq_len(20)]

# each start date is 2014-01-01 shifted by a random whole number of days
start.dates <- as.Date("2014-01-01") +
  as.integer(runif(n = 20, min = 1, max = 365))

# assemble the player table (IDs kept as character, not factor)
player <- data.frame(
  player_id = player.ids,
  start_date = start.dates,
  stringsAsFactors = FALSE
)

# preview the first six players
head(player, n = 6)

##   player_id start_date
## 1         A 2014-09-30
## 2         B 2014-10-13
## 3         C 2014-01-09
## 4         D 2014-10-11
## 5         E 2014-01-26
## 6         F 2014-08-24

Next, create a transactions data frame.

# draw a random number of transactions for each player; truncating
# runif(., 1, 10) with as.integer() yields whole numbers from 1 to 9
transacts <- as.integer(runif(nrow(player), 1, 10))

# build one small data frame of transactions per player and stack them in a
# single bind_rows() call instead of growing a data frame inside a loop.
# Within each player the date offsets are drawn before the amounts, so the
# sequence of random draws matches a plain player-by-player loop.
# NOTE: the date offsets are not rounded, so transact_date carries a
# fractional day part even though it prints as a plain date.
transactions <- bind_rows(lapply(seq_len(nrow(player)), function(i) {
  n_transact <- transacts[i]
  data.frame(
    player_id = rep(player$player_id[i], n_transact),
    transact_date = player$start_date[i] + runif(n_transact, 1, 365),
    # transaction amounts between 100 and 1000, rounded to cents
    transact_amount = round(runif(n_transact, 100, 1000), 2),
    stringsAsFactors = FALSE
  )
}))

# clean up helper objects that are no longer needed
rm(transacts, player.ids, start.dates)

# number the transactions in chronological order and move the new id column
# to the front
transactions <- transactions %>%
  arrange(transact_date) %>%
  mutate(transact_id = seq_len(n())) %>%
  select(transact_id, everything())

# show first six rows of transaction data frame
head(transactions)

## Source: local data frame [6 x 4]
## 
##   transact_id player_id transact_date transact_amount
## 1           1         O    2014-01-22           788.2
## 2           2         S    2014-03-10           661.6
## 3           3         E    2014-03-21           720.1
## 4           4         E    2014-03-23           435.1
## 5           5         C    2014-04-12           939.5
## 6           6         Q    2014-05-05           141.9

Create a summary data frame showing, for each player, the number of days from the start date to the first transaction date and the sum of all transaction amounts.

# Per-player summary built in one pipeline:
#   1. earliest transaction date and total amount per player
#   2. joined onto the player table (which supplies start_date)
#   3. number of days from start date to first transaction
days_to_transact_amt <- transactions %>%
  group_by(player_id) %>%
  summarise(
    first_transact = min(transact_date),
    total_amt = sum(transact_amount)
  ) %>%
  left_join(x = player, y = ., by = "player_id") %>%
  mutate(num_days = ymd(first_transact) - ymd(start_date))

# preview the first six rows
head(days_to_transact_amt)

##   player_id start_date first_transact total_amt num_days
## 1         A 2014-09-30     2014-12-24      2629  85 days
## 2         B 2014-10-13     2014-11-11      3207  29 days
## 3         C 2014-01-09     2014-04-12      4008  93 days
## 4         D 2014-10-11     2014-11-25      4178  45 days
## 5         E 2014-01-26     2014-03-21      3145  54 days
## 6         F 2014-08-24     2014-11-28      1867  96 days

Add a variable to the transactions data frame showing the number of days since each player's previous transaction.

# Compute, for every transaction, the number of days since the same player's
# previous transaction (NA for a player's first transaction).
#
# UPDATED 2015-08-31 using lag()
transactions <- transactions %>%
  # sort so that lag() sees each player's transactions in date order
  arrange(player_id, transact_date) %>%
  group_by(player_id) %>%
  mutate(last_transact_date = lag(transact_date)) %>%
  # drop the grouping so the result is not left grouped by player_id
  ungroup() %>%
  mutate(
    days_last_transact = ymd(transact_date) - ymd(last_transact_date)
  ) %>%
  # the helper column is only needed for the difference above
  select(-last_transact_date)

# show first twelve rows of transactions data frame
head(transactions, 12)

## Source: local data frame [12 x 5]
## Groups: player_id
## 
##    transact_id player_id transact_date transact_amount days_last_transact
## 1           51         A    2014-12-24           633.8            NA days
## 2           82         A    2015-04-30           799.4           127 days
## 3           87         A    2015-05-28           213.7            28 days
## 4          100         A    2015-08-04           665.5            68 days
## 5          105         A    2015-08-27           316.3            23 days
## 6           35         B    2014-11-11           431.4            NA days
## 7           47         B    2014-12-05           540.4            24 days
## 8           49         B    2014-12-09           827.8             4 days
## 9           69         B    2015-03-01           515.5            82 days
## 10          88         B    2015-06-02           177.7            93 days
## 11          99         B    2015-08-04           714.2            63 days
## 12           5         C    2014-04-12           939.5            NA days

K-means clustering example

An example of K-means clustering. First, make an exploratory plot to eyeball a reasonable guess for the number of clusters.

# scatter each player's days-to-first-transaction against their total
# transaction amount, to judge by eye how many clusters look plausible
ggplot(days_to_transact_amt, aes(as.integer(num_days), total_amt)) +
  geom_point(size = 4) +
  labs(
    x = "Number of days from Start Date to First Transaction",
    y = "Total Transaction Amount per Player",
    title = "Exploratory Plot"
  )

plot of chunk unnamed-chunk-5

Based on the plot we’ll use four clusters.

# number of clusters chosen from the exploratory plot
cluster_guess <- 4

# k-means input: the two summary variables, with the day count converted
# from a time difference to a plain number
kmeans_data <- days_to_transact_amt %>%
  select(num_days, total_amt) %>%
  mutate(num_days = as.numeric(num_days))

# fit k-means with the guessed number of clusters
km_obj <- kmeans(kmeans_data, centers = cluster_guess)

# record each player's assigned cluster on the summary data frame
days_to_transact_amt[["km_group"]] <- km_obj$cluster

# cluster centers as a data frame so they can be drawn as a plot layer
km_centers <- as.data.frame(km_obj$centers)

# players coloured by cluster, with each cluster center overlaid as a cross
ggplot() +
  geom_point(
    data = days_to_transact_amt,
    mapping = aes(
      x = as.integer(num_days),
      y = total_amt,
      colour = factor(km_group)
    ),
    size = 4
  ) +
  geom_point(
    data = km_centers,
    mapping = aes(
      x = num_days,
      y = total_amt,
      colour = factor(seq_len(cluster_guess))
    ),
    shape = 3,
    size = 6
  ) +
  labs(
    x = "Number of days from Start Date to First Transaction",
    y = "Total Transaction Amount per Player",
    title = "K-means Clustering Example"
  ) +
  theme(legend.position = "none")

plot of chunk unnamed-chunk-6

The num_days and total_amt variables may not be the optimal choice for an actual K-means analysis, but they work for this illustrative example.

Data Aggregation and K-means example

Andy Rosa

August 19, 2015

Aggregating Data

K-means clustering example