A quick reproducible example of summarizing data and K-means clustering on a randomly generated gaming dataset. Steps in the analysis are mostly commented.
Start by setting up the player data frame.
# load packages
library(dplyr)
library(lubridate)
library(ggplot2)
# Fix the RNG seed so every run generates the same data
set.seed(234)
# Player IDs: the first 20 upper-case letters
player.ids <- LETTERS[seq_len(20)]
# Random start dates: 2014-01-01 plus a whole-day offset. as.integer()
# truncates the uniform draw, so offsets fall between 1 and 364 days.
start.dates <- as.Date("2014-01-01") +
  as.integer(runif(n = 20, min = 1, max = 365))
# Assemble the player data frame (IDs stay character, not factor)
player <- data.frame(
  player_id = player.ids,
  start_date = start.dates,
  stringsAsFactors = FALSE
)
# Preview the first six players
head(player, n = 6)
## player_id start_date
## 1 A 2014-09-30
## 2 B 2014-10-13
## 3 C 2014-01-09
## 4 D 2014-10-11
## 5 E 2014-01-26
## 6 F 2014-08-24
Next create a transactions data frame.
# Number of transactions per player. as.integer() truncates the uniform
# draw, so each player gets between 1 and 9 transactions.
transacts <- as.integer(runif(20, 1, 10))
# Build one data frame of transactions per player and bind them in a single
# call, rather than growing `transactions` inside a loop. Players are handled
# one at a time, with dates drawn before amounts, so the random number stream
# is consumed in a fixed order and the generated data stay reproducible.
transactions <- bind_rows(lapply(seq_len(nrow(player)), function(idx) {
  n_transact <- transacts[idx]
  data.frame(
    player_id = rep(player$player_id[idx], n_transact),
    # 1 to 365 days after the player's start date; the offset is not
    # rounded, so the stored dates carry a fractional day
    transact_date = player$start_date[idx] + runif(n_transact, 1, 365),
    # amounts between 100 and 1000, rounded to two decimals
    transact_amount = round(runif(n_transact, 100, 1000), 2),
    stringsAsFactors = FALSE
  )
}))
# clean up intermediate objects that are no longer needed
rm(transacts, player.ids, start.dates)
# Assign transaction IDs in chronological order
transactions <- arrange(transactions, transact_date)
transactions$transact_id <- seq_len(nrow(transactions))
# Put the ID column first, selecting columns by name rather than position
transactions <- select(
  transactions,
  transact_id, player_id, transact_date, transact_amount
)
# show first six rows of transaction data frame
head(transactions)
## Source: local data frame [6 x 4]
##
## transact_id player_id transact_date transact_amount
## 1 1 O 2014-01-22 788.2
## 2 2 S 2014-03-10 661.6
## 3 3 E 2014-03-21 720.1
## 4 4 E 2014-03-23 435.1
## 5 5 C 2014-04-12 939.5
## 6 6 Q 2014-05-05 141.9
Create a summary data frame showing, for each player, the number of days from the start date to the first transaction and the total transaction amount.
# One row per player: start date, date of first transaction, total amount
# spent, and the number of days from start date to first transaction.
days_to_transact_amt <- player %>%
  left_join(
    transactions %>%
      group_by(player_id) %>%
      summarise(
        first_transact = min(transact_date),
        total_amt = sum(transact_amount)
      ),
    by = "player_id"
  ) %>%
  # Both dates go through ymd() before subtracting (presumably to drop the
  # fractional day carried by first_transact -- confirm); the result is a
  # difftime in days.
  mutate(num_days = ymd(first_transact) - ymd(start_date))
# show first six rows
head(days_to_transact_amt)
## player_id start_date first_transact total_amt num_days
## 1 A 2014-09-30 2014-12-24 2629 85 days
## 2 B 2014-10-13 2014-11-11 3207 29 days
## 3 C 2014-01-09 2014-04-12 4008 93 days
## 4 D 2014-10-11 2014-11-25 4178 45 days
## 5 E 2014-01-26 2014-03-21 3145 54 days
## 6 F 2014-08-24 2014-11-28 1867 96 days
Add a variable to the transactions data frame showing the number of days since each player's previous transaction.
# Days since each player's previous transaction.
# UPDATED 2015-08-31 using lag(): calculate time since last transaction
transactions <- transactions %>%
  # sort by player and date so lag() returns the player's previous transaction
  arrange(player_id, transact_date) %>%
  group_by(player_id) %>%
  # a player's first transaction has no predecessor, so lag() yields NA there
  mutate(last_transact_date = lag(transact_date)) %>%
  # drop the grouping so later steps do not silently operate per player
  ungroup() %>%
  mutate(
    days_last_transact = ymd(transact_date) - ymd(last_transact_date)
  ) %>%
  # the helper column is only needed to compute the difference
  select(-last_transact_date)
# show first twelve rows of transactions data frame
head(transactions, 12)
## Source: local data frame [12 x 5]
##
## transact_id player_id transact_date transact_amount days_last_transact
## 1 51 A 2014-12-24 633.8 NA days
## 2 82 A 2015-04-30 799.4 127 days
## 3 87 A 2015-05-28 213.7 28 days
## 4 100 A 2015-08-04 665.5 68 days
## 5 105 A 2015-08-27 316.3 23 days
## 6 35 B 2014-11-11 431.4 NA days
## 7 47 B 2014-12-05 540.4 24 days
## 8 49 B 2014-12-09 827.8 4 days
## 9 69 B 2015-03-01 515.5 82 days
## 10 88 B 2015-06-02 177.7 93 days
## 11 99 B 2015-08-04 714.2 63 days
## 12 5 C 2014-04-12 939.5 NA days
Example of K-means clustering. First make exploratory plot to eyeball a guess for number of clusters.
# Exploratory scatter plot: days to first transaction against total amount,
# one point per player, used to eyeball a sensible number of clusters
ggplot(days_to_transact_amt, aes(x = as.integer(num_days), y = total_amt)) +
  geom_point(size = 4) +
  labs(
    x = "Number of days from Start Date to First Transaction",
    y = "Total Transaction Amount per Player",
    title = "Exploratory Plot"
  )
Based on the plot we’ll use four clusters.
# Number of clusters, chosen by eye from the exploratory plot
cluster_guess <- 4
# Clustering inputs in their original units, as plain numerics
kmeans_data <- select(days_to_transact_amt, num_days, total_amt)
kmeans_data$num_days <- as.numeric(kmeans_data$num_days)
# Standardise both variables before clustering. k-means uses Euclidean
# distance, so on the raw values total_amt (hundreds to thousands) would
# swamp num_days (at most 365) and the clusters would largely ignore it.
kmeans_scaled <- scale(kmeans_data)
# Fit k-means; nstart > 1 keeps the best of several random starts so the
# result is less sensitive to an unlucky initialisation
km_obj <- kmeans(kmeans_scaled, centers = cluster_guess, nstart = 25)
# add cluster group to dataset
days_to_transact_amt$km_group <- km_obj$cluster
# Cluster centres in the original units for plotting: the per-cluster means
# of the unscaled variables, with km_group identifying each centre
km_centers <- aggregate(
  kmeans_data,
  by = list(km_group = km_obj$cluster),
  FUN = mean
)
# Plot players coloured by cluster, with each cluster centre drawn as a cross
# in the matching colour
ggplot() +
  geom_point(
    data = days_to_transact_amt,
    aes(x = as.integer(num_days), y = total_amt, colour = factor(km_group)),
    size = 4
  ) +
  geom_point(
    data = km_centers,
    aes(x = num_days, y = total_amt, colour = factor(km_group)),
    shape = 3,
    size = 6
  ) +
  labs(
    x = "Number of days from Start Date to First Transaction",
    y = "Total Transaction Amount per Player",
    title = "K-means Clustering Example"
  ) +
  theme(legend.position = "none")
The variables num_days and total_amt may not be the best choice for a real K-means analysis, but they work for this illustrative example.