This is an introductory exploratory analysis focusing on the problem of guild entering event prediction (i.e.Ā events when avatars join guilds).
Setting some basic parameters, loading packages and functions, reading data:
avatar_graph_time_window <- 30
prediction_time_window <- 7
library(dplyr)
library(lubridate)
library(data.table)
library(pROC)
library(cvTools)
library(readr)
library(igraph)
library(tidyr)
library(igraph)
library(ggplot2)
library(cowplot)
library(data.table)
source("common/init.R")##
Read 11.7% of 10826734 rows
Read 31.3% of 10826734 rows
Read 47.4% of 10826734 rows
Read 64.7% of 10826734 rows
Read 80.5% of 10826734 rows
Read 85.0% of 10826734 rows
Read 10826734 rows and 7 (of 7) columns from 0.599 GB file in 00:00:10
source("guild_quitting/create_intraguild_sn_timewindow.R")
prediction_dates_train <- as.character(seq(as.Date("2008-02-01"), as.Date("2008-10-25"), 7))
prediction_dates_test <- as.character(seq(as.Date("2008-11-01"), as.Date("2008-12-25"), 7))
set.seed(0)
### Add zone type column
zones <- read_csv(paste(data_dir, "zones.csv", sep = ""))
wow <- left_join(wow, zones %>%
select(Type, Zone_Name) %>%
mutate(zone_type = Type, zone = Zone_Name, Type = NULL, Zone_Name = NULL),
by = "zone")
rm(zones)We transform the original data to create a strutured dataset containing samples that can be used for guild event prediction. Each sample corresponds to a prediction date-avatar pair, and the target variable indicates which guild was joined by the avatar within the prediction time window after the current prediction date.
compute_features_and_labels_pred_date <- function(data, pred_date){
print("compute_features_and_labels --- start")
pred_date <- as.Date(pred_date)
testset_end_date <- pred_date + prediction_time_window
# Keep only the data regarding known avatars
#avatars_in_guilds <- intraguild_graphs$nodes
#data <- data %>% filter(avatar %in% avatars_in_guilds$avatar)
min_date <- min(data$current_date)
##### Compute labels for avatars
test_data <- data %>% filter(current_date >= pred_date & current_date < testset_end_date)
labels <- test_data %>%
filter(event == "Guild Entered" | event == "Guild Changed") %>%
group_by(avatar) %>%
slice(1) %>%
group_by() %>%
select(avatar, current_date, guild, event, prev_guild)
train_data <- data %>% filter(current_date < pred_date)
# get the avatars that are not guild members on the prediction date
# out_of_guild_avatars <- train_data %>%
# group_by(avatar) %>%
# dplyr::slice(n()) %>%
# group_by() %>%
# filter(guild == -1) %>% # ezt kiszedni?
# select(avatar, guild)
## Features: char, race, level, number of different guild the avatar had been member of
features <- train_data %>%
#filter(avatar %in% out_of_guild_avatars$avatar) %>% # ezt kiszedni?
group_by(avatar) %>%
summarise(
race = race[1],
charclass = charclass[1],
level = max(level),
diff_guild_count = length(unique(guild[guild != -1])))
### ...
### Compute various features here...
### ...
##### Joining labels
print("Joining features and labels")
features_and_labels <- left_join(features, labels, by = "avatar")
features_and_labels$pred_date <- pred_date
features_and_labels$testset_end_date <- testset_end_date
features_and_labels
features_and_labels <- features_and_labels #%>% filter(!is.na(guild))
features_and_labels <- features_and_labels %>% mutate(known_label = ifelse(
!is.na(guild) & guild %in% unique(train_data$guild), TRUE, FALSE))
##### RETURN
print("compute_features_and_labels --- return")
features_and_labels
}
compute_features_and_labels <- function(pred_dates){
dataset <- data.frame()
datasets_list <- lapply(pred_dates, function(pred_date){
print(paste("Computing features and labels for prediction date", pred_date))
data_subset <- compute_features_and_labels_pred_date(wow, pred_date)
data_subset
})
datasets_list
#concat result datasets
lapply(datasets_list, function(data_subset){
if (nrow(dataset) == 0){
dataset <<- data_subset
} else {
dataset <<- rbind(dataset, data_subset)
}
c()
})
dataset
}
data_train <- compute_features_and_labels(prediction_dates_train)## [1] "Computing features and labels for prediction date 2008-02-01"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-02-08"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-02-15"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-02-22"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-02-29"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-03-07"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-03-14"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-03-21"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-03-28"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-04-04"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-04-11"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-04-18"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-04-25"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-05-02"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-05-09"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-05-16"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-05-23"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-05-30"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-06-06"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-06-13"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-06-20"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-06-27"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-07-04"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-07-11"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-07-18"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-07-25"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-08-01"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-08-08"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-08-15"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-08-22"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-08-29"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-09-05"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-09-12"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-09-19"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-09-26"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-10-03"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-10-10"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-10-17"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
## [1] "Computing features and labels for prediction date 2008-10-24"
## [1] "compute_features_and_labels --- start"
## [1] "Joining features and labels"
## [1] "compute_features_and_labels --- return"
# number of all samples
nrow(data_train)## [1] 726800
# training data
glimpse(data_train)## Observations: 726,800
## Variables: 12
## $ avatar <chr> "1003.Orc.Warrior", "10043.Undead.Rogue", "10...
## $ race <chr> "Orc", "Undead", "Tauren", "Undead", "Tauren"...
## $ charclass <chr> "Warrior", "Rogue", "Warrior", "Mage", "Hunte...
## $ level <int> 70, 70, 20, 60, 70, 70, 12, 57, 70, 67, 52, 7...
## $ diff_guild_count <int> 2, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 2, 1, 1, ...
## $ current_date <date> 2008-02-05, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ guild <int> 281, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ event <chr> "Guild Entered", NA, NA, NA, NA, NA, NA, NA, ...
## $ prev_guild <dbl> -1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pred_date <date> 2008-02-01, 2008-02-01, 2008-02-01, 2008-02-...
## $ testset_end_date <date> 2008-02-08, 2008-02-08, 2008-02-08, 2008-02-...
## $ known_label <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
Total number of samples:
data_train %>% summarise(samples_all = n(),
samples_with_labels = sum(!is.na(guild)),
samples_with_known_labels = sum(known_label))## # A tibble: 1 Ć 3
## samples_all samples_with_labels samples_with_known_labels
## <int> <int> <int>
## 1 726800 18010 17377
Number of samples by guilds:
data_train %>% group_by(guild) %>%
summarise(n = n()) %>%
ggplot(aes(x = guild)) + geom_bar(aes(y = n), stat = 'identity') +
theme_bw() +
labs(x = "Guild ID", y = "Number of Samples")Number of samples by prediction date and label type:
data_train %>%
group_by(pred_date) %>%
summarise(samples_all = n(),
samples_with_labels = sum(!is.na(guild)),
samples_with_known_labels = sum(known_label)) %>%
gather(type, num_samples, -pred_date) %>%
mutate(type = factor(type, levels = c("samples_all", "samples_with_labels", "samples_with_known_labels"))) %>%
ggplot(aes(x = pred_date, y = num_samples)) +
geom_bar(aes(fill = type),
stat = 'identity',
position = 'dodge') +
geom_text(aes(color = type, label = num_samples),
position = position_dodge(width = 30),
angle = 45, vjust = -0.6, hjust = 0.15) +
theme_bw() +
labs(x = "Prediction Date", y = "Number of Samples")Percentage of samples belonging to guilds with the most samples:
# percentage of samples corresponding to the top guildy by the number of top guilds
plot_samplepercent_of_top_guilds <- function(data){
data <- data %>% filter(!is.na(guild))
samples_by_guilds <- data %>% group_by(guild) %>%
summarise(n = n()) %>% arrange(desc(n))
samples_by_top_guilds_df <- data.frame(number_of_top_guilds = 1:nrow(samples_by_guilds)) %>%
mutate(percent_of_samples =
sapply(1:max(number_of_top_guilds), function(x){sum(samples_by_guilds$n[1:x])})
/ nrow(data) * 100)
samples_by_top_guilds_df %>%
ggplot(aes(x = number_of_top_guilds, y = percent_of_samples)) +
geom_line(color = "steelblue") + theme_bw() +
labs(x = "Number of Top Guilds Owning the Most Samples", y = "Percent of Comprised Samples",
title = paste(min(data$pred_date), max(data$pred_date), sep = "-"))
}
plot_samplepercent_of_top_guilds(data_train)selected_pred_dates <- sort(sample(unique(as.character(data_train$pred_date)), 8))
plots <- lapply(selected_pred_dates, function(current_pred_date){
plot_samplepercent_of_top_guilds(data_train %>% filter(as.character(pred_date) == current_pred_date))
})
plot_grid(plots[[1]], plots[[2]], plots[[3]], plots[[4]], plots[[5]], plots[[6]], plots[[7]], plots[[8]], nrow = 2)# number of labeled samples of top guilds by time
top_guilds_by_time <- data_train %>%
filter(!is.na(guild)) %>%
group_by(guild) %>%
mutate(samples_of_guild = n()) %>% group_by %>%
filter(samples_of_guild > 10) %>% # consider only the guilds that have at least N samples
group_by(pred_date, guild) %>%
summarise(number_of_samples = n()) %>%
arrange(pred_date, desc(number_of_samples)) %>%
mutate(guild = factor(guild))
head(top_guilds_by_time, 100)## Source: local data frame [100 x 3]
## Groups: pred_date [2]
##
## pred_date guild number_of_samples
## <date> <fctr> <int>
## 1 2008-02-01 103 99
## 2 2008-02-01 282 80
## 3 2008-02-01 204 51
## 4 2008-02-01 104 43
## 5 2008-02-01 101 31
## 6 2008-02-01 167 27
## 7 2008-02-01 174 22
## 8 2008-02-01 53 20
## 9 2008-02-01 35 18
## 10 2008-02-01 189 17
## # ... with 90 more rows
Check if there is any correlation between the number of samples for subsequent prediction dates for individual guilds. We only show guilds that have at least a certain number of samples.
ggplot(top_guilds_by_time, aes(x = pred_date, y = number_of_samples, group = guild, color = guild)) +
geom_line() +
theme_bw() + labs(
x = "Prediction Date",
y = "Number of Samples",
title = "Number of Samples with Labels by Prediction Date and Guild")The same thing on a heatmap:
ggplot(top_guilds_by_time, aes(x = pred_date, y = guild)) +
geom_tile(aes(fill = number_of_samples), color = "white") +
theme_bw() +
labs(x = 'Prediction Date',
y = 'Guild ID',
title = "Number of Samples with Labels by Prediction Date and Guild") +
#scale_fill_distiller(palette = "Spectral")
scale_fill_gradient(low = "#ececec", high = "#000251") +
geom_text(aes(label = number_of_samples), color = 'orange', size = 2.2)The average autocorrelation of the number of samples on the subsequent prediction dates of individual guilds (the average autocorrelations for the rows of the heatmap):
acf_values <- top_guilds_by_time %>% group_by(guild) %>% do({
tmp <- acf(.$number_of_samples, plot = FALSE)
result <- as.data.frame(matrix(tmp$acf, nrow = 1))
names(result) <- paste0("lag", 0:(length(tmp$acf) - 1))
result
})
acf_values$guild <- 0
colMeans(acf_values, na.rm = T)## guild lag0 lag1 lag2 lag3
## 0.000000000 1.000000000 0.158266137 0.073840246 0.033483074
## lag4 lag5 lag6 lag7 lag8
## 0.025023646 -0.018173877 -0.004746851 -0.033590460 -0.025330499
## lag9 lag10 lag11 lag12 lag13
## -0.038644033 -0.052794758 -0.064635283 -0.080388623 -0.057819792
## lag14 lag15
## -0.065378616 -0.024600072
Few helper functions to compute the size of guilds and to construct the network of avatars on a specific prediction date:
# get guild sizes for prediction dates
get_guild_sizes_for_pred_date <- function(pred_date){
guild_sizes <- wow %>%
filter(current_date < as.Date(pred_date)) %>%
group_by(avatar) %>%
slice(n()) %>%
group_by(guild) %>%
summarise(size = n()) %>%
left_join(wow %>%
filter(current_date < as.Date(pred_date)) %>%
filter(current_date >= as.Date(pred_date) - avatar_graph_time_window) %>%
group_by(avatar) %>%
slice(n()) %>%
group_by(guild) %>%
summarise(effective_size = n())) %>%
mutate(effective_size = ifelse(is.na(effective_size), 0, effective_size),
pred_date = pred_date)
}
guild_sizes_by_pred_date_list <- lapply(prediction_dates_train, get_guild_sizes_for_pred_date)
guild_sizes_by_pred_date <- rbindlist(guild_sizes_by_pred_date_list)
# get the avatar network for a given prediction date
get_extended_graph_data <- function(current_pred_date){
graph_data <- create_intraguild_graphs(wow,
current_pred_date,
time_window = avatar_graph_time_window)
vertices_df <- wow %>%
filter(current_date < as.Date(current_pred_date)) %>%
filter(current_date >= as.Date(current_pred_date) - avatar_graph_time_window) %>%
group_by(avatar) %>%
slice(n()) %>% group_by() %>%
select(avatar, guild) %>%
left_join(data_train %>%
filter(pred_date == current_pred_date) %>%
select(avatar, label_guild = guild))
edges_symetric <- rbind(graph_data$edges, data.frame(
node_1 = graph_data$edges$node_2,
node_2 = graph_data$edges$node_1,
weight = graph_data$edges$weight
))
# adding guilds to graph nodes
edges_symetric <- edges_symetric %>%
left_join(vertices_df %>% mutate(node_1 = avatar) %>% select(node_1, guild)) %>%
mutate(node_1_guild = guild, guild = NULL) %>%
left_join(vertices_df %>% mutate(node_2 = avatar) %>% select(node_2, guild)) %>%
mutate(node_2_guild = guild, guild = NULL)
list(graph_data = graph_data,
vertices_df = vertices_df,
edges_symetric = edges_symetric,
pred_date = current_pred_date)
}Construct the network of avatars for each prediction date:
avatar_network_list <- sapply(prediction_dates_train, function(current_pred_date){
get_extended_graph_data(current_pred_date)
}, USE.NAMES = TRUE, simplify = FALSE)
edges_symetric_all <- rbindlist(lapply(avatar_network_list, function(x){
tmp <- x$edges_symetric
tmp$pred_date <- x$pred_date
tmp
}))
vertices_df_all <- rbindlist(lapply(avatar_network_list, function(x){
tmp <- x$vertices_df
tmp$pred_date <- x$pred_date
tmp
}))get_conn_between_avatars_and_guilds <- function(current_pred_date, avatar_network){
guild_sizes_on_pred_date <- get_guild_sizes_for_pred_date(current_pred_date)
tmp <- avatar_network$edges_symetric %>% mutate(avatar = node_1) %>%
filter(node_2_guild != -1) %>%
mutate(guild = node_2_guild) %>%
left_join(guild_sizes_on_pred_date) %>%
mutate(guild = NULL) %>%
left_join(avatar_network$vertices_df) %>% group_by(avatar) %>%
summarise(degree_to_target_guild = sum(weight[node_2_guild == label_guild]),
degree_ratio_to_target_guild = degree_to_target_guild / size[node_2_guild == label_guild][1])
tmp2 <- avatar_network$edges_symetric %>% mutate(avatar = node_1) %>%
filter(node_2_guild != -1) %>%
mutate(guild = node_2_guild) %>%
left_join(guild_sizes_on_pred_date) %>%
mutate(guild = NULL) %>%
group_by(avatar, node_2_guild) %>%
summarise(degree_to_guild = sum(weight), size = size[1]) %>%
summarise(degree_to_guild_max = max(degree_to_guild),
degree_to_guild_max_guild = node_2_guild[which.max(degree_to_guild)],
degree_ratio_to_guild_max = max(degree_to_guild/size),
degree_ratio_to_guild_max_guild = node_2_guild[which.max(degree_to_guild/size)])
node_degrees <- avatar_network$vertices_df %>%
filter(!is.na(label_guild)) %>%
left_join(avatar_network$edges_symetric %>% mutate(avatar = node_1) %>% group_by(avatar) %>%
summarise(degree_all = sum(weight))) %>%
left_join(tmp) %>%
left_join(tmp2) %>%
mutate(joined_max = (label_guild == degree_to_guild_max_guild)) %>%
left_join(guild_sizes_on_pred_date %>%
mutate(label_guild = guild, label_guild_size = size) %>%
select(label_guild, label_guild_size, pred_date)) %>%
mutate(joined_max_ratio = (label_guild == degree_ratio_to_guild_max_guild))
node_degrees$pred_date <- current_pred_date
node_degrees
}
print(class(prediction_dates_train))## [1] "character"
node_degrees_list <- lapply(prediction_dates_train, function(current_pred_date){
get_conn_between_avatars_and_guilds(current_pred_date, avatar_network_list[[current_pred_date]])
})
node_degrees <- rbindlist(node_degrees_list)How many samples contain an avatar who joined its most connected guild:
summary(node_degrees$joined_max)## Mode FALSE TRUE NA's
## logical 10979 5662 626
summary(node_degrees$joined_max_ratio)## Mode FALSE TRUE NA's
## logical 13432 3209 626
Largest guilds =?= Most prevalent guilds among the target guilds where joined_max or joined_max_ratio is TRUE
# largest guilds (most members)
guild_sizes_by_pred_date %>%
filter(guild != -1) %>%
select(pred_date, guild, size) %>%
group_by(pred_date) %>%
slice(1:min(nrow(.), 5)) %>%
group_by() %>%
arrange(pred_date, desc(size))## # A tibble: 195 Ć 3
## pred_date guild size
## <chr> <int> <int>
## 1 2008-02-01 5 188
## 2 2008-02-01 3 18
## 3 2008-02-01 4 15
## 4 2008-02-01 8 4
## 5 2008-02-01 6 1
## 6 2008-02-08 5 186
## 7 2008-02-08 3 19
## 8 2008-02-08 4 16
## 9 2008-02-08 1 1
## 10 2008-02-08 6 1
## # ... with 185 more rows
# guilds with most samples
node_degrees %>%
group_by(pred_date, label_guild) %>%
summarise(size = label_guild_size[1], samples = n()) %>%
slice(1:min(nrow(.), 5)) %>%
arrange(pred_date, desc(samples))## Source: local data frame [195 x 4]
## Groups: pred_date [39]
##
## pred_date label_guild size samples
## <chr> <int> <int> <int>
## 1 2008-02-01 5 188 14
## 2 2008-02-01 9 51 5
## 3 2008-02-01 4 15 3
## 4 2008-02-01 3 18 1
## 5 2008-02-01 16 6 1
## 6 2008-02-08 5 186 14
## 7 2008-02-08 9 51 8
## 8 2008-02-08 19 116 6
## 9 2008-02-08 16 5 3
## 10 2008-02-08 4 16 2
## # ... with 185 more rows
# guilds with the most samples where the avatar joined the most connected guild (sum of connections)
node_degrees %>%
filter(joined_max) %>%
group_by(pred_date, label_guild) %>%
summarise(size = label_guild_size[1], samples = n()) %>%
slice(1:min(nrow(.), 5)) %>%
arrange(pred_date, desc(samples))## Source: local data frame [195 x 4]
## Groups: pred_date [39]
##
## pred_date label_guild size samples
## <chr> <int> <int> <int>
## 1 2008-02-01 35 152 3
## 2 2008-02-01 53 194 3
## 3 2008-02-01 19 140 1
## 4 2008-02-01 62 77 1
## 5 2008-02-01 79 77 1
## 6 2008-02-08 101 225 15
## 7 2008-02-08 62 87 5
## 8 2008-02-08 35 154 3
## 9 2008-02-08 53 196 1
## 10 2008-02-08 79 83 1
## # ... with 185 more rows
# guilds with the most samples where the avatar joined the most connected guild (mean connectedness)
node_degrees %>%
filter(joined_max_ratio) %>%
group_by(pred_date, label_guild) %>%
summarise(size = label_guild_size[1], samples = n()) %>%
slice(1:min(nrow(.), 5)) %>%
arrange(pred_date, desc(samples))## Source: local data frame [195 x 4]
## Groups: pred_date [39]
##
## pred_date label_guild size samples
## <chr> <int> <int> <int>
## 1 2008-02-01 35 152 5
## 2 2008-02-01 19 140 4
## 3 2008-02-01 5 188 3
## 4 2008-02-01 4 15 1
## 5 2008-02-01 53 194 1
## 6 2008-02-08 35 154 4
## 7 2008-02-08 5 186 2
## 8 2008-02-08 4 16 1
## 9 2008-02-08 16 5 1
## 10 2008-02-08 19 116 1
## # ... with 185 more rows
Difference in the degree of connectedness between avatars who had joined their most connected guild and avatars who hadnāt:
summary(node_degrees$degree_to_guild_max[!is.na(node_degrees$degree_to_guild_max) & node_degrees$joined_max])## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 604 2139 4630 6172 34480
summary(node_degrees$degree_to_guild_max[!is.na(node_degrees$degree_to_guild_max) & !node_degrees$joined_max])## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 112 570 1137 1600 33620
ggplot(node_degrees, aes(x = degree_to_guild_max)) +
geom_density(aes(fill = joined_max, color = joined_max), alpha = 0.2) +
theme_bw() +
labs(x = "Maximal Connectedness to a Guild",
y = "Density",
title = "Distribution of the Maximal Guild Connectednesses of Avatars",
subtitle = "Two Groups: Whether Avatars Joined Their Most Connected Guild or Not")get_aggregated_avatar_guild_connections <- function(current_pred_date, avatar_network){
connectedness_to_guilds <- avatar_network$edges_symetric %>% mutate(avatar = node_1) %>%
filter(node_2_guild != -1) %>%
mutate(guild = node_2_guild) %>%
left_join(guild_sizes_by_pred_date %>%
filter(pred_date == current_pred_date)) %>%
mutate(guild = NULL) %>%
group_by(avatar, node_2_guild) %>%
summarise(degree_to_guild = sum(weight),
size = size[1],
effective_size = effective_size[1],
mean_degree = degree_to_guild / size,
effective_mean_degree = degree_to_guild / effective_size) %>%
mutate(guild = node_2_guild, node_2_guild = NULL) %>% group_by()
avatars_and_guilds <- expand.grid(unique(avatar_network$edges_symetric$node_1), unique(avatar_network$edges_symetric$node_2_guild), stringsAsFactors = FALSE)
names(avatars_and_guilds) <- c("avatar", "guild")
avatar_guild_connectedness_for_known_samples <- avatars_and_guilds %>%
left_join(connectedness_to_guilds) %>%
mutate(mean_degree = ifelse(is.na(mean_degree), 0, mean_degree),
effective_mean_degree = ifelse(is.na(effective_mean_degree), 0, effective_mean_degree)) %>%
right_join(data_train %>%
filter(pred_date == current_pred_date & !is.na(guild) & known_label) %>%
select(avatar, label_guild = guild)) %>%
mutate(is_target_guild = (!is.na(label_guild) & label_guild == guild))
connectedness_to_guilds$pred_date <- current_pred_date
avatar_guild_connectedness_for_known_samples$pred_date <- current_pred_date
list(all_avatar_guild_conns = connectedness_to_guilds,
known_samples = avatar_guild_connectedness_for_known_samples)
}
avatar_guild_connectedness_list <- lapply(prediction_dates_train, function(current_pred_date){
get_aggregated_avatar_guild_connections(current_pred_date, avatar_network_list[[current_pred_date]])
})
avatar_guild_connectedness_all_conns <- rbindlist(
lapply(avatar_guild_connectedness_list, function(x){x$all_avatar_guild_conns}))
avatar_guild_connectedness_known_samples <- rbindlist(
lapply(avatar_guild_connectedness_list, function(x){x$known_samples}))If connectedness is measured by the effective mean degree (the mean connectedness to active guild members):
avatar_guild_connectedness_known_samples %>% filter(effective_mean_degree > 5) %>%
ggplot(aes(x = effective_mean_degree)) +
geom_density(aes(fill = is_target_guild, color = is_target_guild), alpha = 0.2) + theme_bw()If connectedness is measured by the simple mean degree (the mean connectedness to guild members):
avatar_guild_connectedness_known_samples %>% filter(mean_degree > 5) %>%
ggplot(aes(x = mean_degree)) +
geom_density(aes(fill = is_target_guild, color = is_target_guild), alpha = 0.2) + theme_bw()Connectedness rank: for each avatar guilds are sorted in descending order according to their connectedness to the given avatar. The rank is the position of the avatarās target guild in this list.
target_guild_connectedness_with_ranks <-
avatar_guild_connectedness_all_conns %>%
group_by(pred_date, avatar) %>%
arrange(desc(effective_mean_degree)) %>%
#arrange(desc(mean_degree)) %>%
mutate(rank = seq_along(effective_mean_degree)) %>%
group_by() %>%
right_join(data_train %>%
mutate(pred_date = as.character(pred_date)) %>%
filter(#pred_date == "2008-02-01" &
!is.na(guild) & known_label) %>%
select(avatar, pred_date, label_guild = guild)) %>%
filter(!is.na(label_guild) & label_guild == guild)
# histogram:
ggplot(target_guild_connectedness_with_ranks, aes(x = rank)) +
geom_histogram(binwidth = 1, fill = 'steelblue', alpha = 0.6) + theme_bw() +
labs(title = "Rank of a Sample:", subtitle = "Which most connected guild is the target guild")# density function:
#ggplot(target_guild_connectedness_with_ranks, aes(x = rank)) +
# geom_density(fill = 'steelblue', alpha = 0.6) + theme_bw() +
# labs(title = "Rank of a Sample:", subtitle = "Which most connected guild is the target guild")tmp3 <- edges_symetric_all %>% mutate(avatar = node_1) %>%
filter(node_2_guild != -1) %>%
group_by(avatar, pred_date) %>% # ide a group_by-hoz hozzƔadni a pred_date-t
arrange(desc(weight)) %>%
slice(1:10) %>%
mutate(rank1 = paste0("rank-", seq_len(n()), "-guild")) %>%
mutate(rank2 = paste0("rank-", seq_len(n()), "-weight")) %>%
select(avatar, weight, node_2_guild, node_2, rank1, rank2) %>% # ide is pred_date
group_by()
tmp4 <- tmp3 %>%
group_by(pred_date) %>% #talĆ”n Ćgy jó lesz
select(avatar, node_2_guild, rank1) %>%
spread(key = rank1, value = node_2_guild) %>%
group_by()
tmp5 <- tmp3 %>%
group_by(pred_date) %>% #talĆ”n Ćgy jó lesz
select(avatar, weight, rank2) %>%
spread(key = rank2, value = weight) %>%
group_by()
node_degrees_to_nodes <- vertices_df_all %>%
filter(!is.na(label_guild)) %>%
left_join(edges_symetric_all %>% mutate(avatar = node_1) %>% group_by(avatar) %>%
summarise(degree_all = sum(weight))) %>%
left_join(tmp4) %>%
left_join(tmp5)
nrow(node_degrees_to_nodes)## [1] 17267
head(node_degrees_to_nodes)## avatar guild label_guild pred_date degree_all
## 1 1003.Orc.Warrior 281 281 2008-02-01 394032
## 2 10543.Undead.Mage 9 9 2008-02-01 275028
## 3 11043.Troll.Hunter 165 90 2008-02-01 90342
## 4 1131.Troll.Hunter 101 101 2008-02-01 419148
## 5 1154.Orc.Hunter -1 282 2008-02-01 8768
## 6 11589.Undead.Priest 189 189 2008-02-01 579260
## rank-10-guild rank-1-guild rank-2-guild rank-3-guild rank-4-guild
## 1 204 204 204 204 204
## 2 161 161 167 19 189
## 3 35 35 103 220 103
## 4 101 101 101 101 101
## 5 243 103 165 189 5
## 6 167 189 189 282 103
## rank-5-guild rank-6-guild rank-7-guild rank-8-guild rank-9-guild
## 1 204 204 204 204 204
## 2 19 189 9 4 282
## 3 167 35 103 282 104
## 4 101 101 101 101 101
## 5 291 135 5 103 243
## 6 53 282 253 282 103
## rank-10-weight rank-1-weight rank-2-weight rank-3-weight rank-4-weight
## 1 266 350 336 332 324
## 2 32 102 94 52 44
## 3 44 362 200 100 58
## 4 574 684 680 642 626
## 5 28 40 36 34 34
## 6 42 114 88 58 56
## rank-5-weight rank-6-weight rank-7-weight rank-8-weight rank-9-weight
## 1 314 282 280 272 270
## 2 42 38 34 34 34
## 3 54 52 52 48 46
## 4 614 610 600 600 582
## 5 32 30 30 30 30
## 6 54 44 44 44 42
Distribution of edge weights towards other avatars by their guild (target/non-target). Note that only the avatars of the training data are considered (i.e.Ā avatars with no guilds on the current prediction date):
node_degrees_to_nodes_all <-
edges_symetric_all %>% mutate(avatar = node_1) %>%
filter(node_2_guild != -1) %>% # ez ki is veheto
right_join(data_train %>%
mutate(pred_date = as.character(pred_date)) %>%
mutate(label_guild = guild) %>%
#filter(known_label) %>%
filter(!is.na(label_guild)) %>%
select(avatar, label_guild, pred_date)
) %>%
mutate(to_target_guild = !is.na(label_guild) & !is.na(node_2_guild) & label_guild == node_2_guild)
node_degrees_to_nodes_all %>% filter(weight > 5) %>%
ggplot(aes(x = weight)) + #xlim(0, 200) +
geom_density(aes(fill = to_target_guild, color = to_target_guild), alpha = 0.2) + theme_bw()Ratio of target guild edges and other edges by the number of included top connections of avatars:
all_edges <- node_degrees_to_nodes_all %>% group_by(to_target_guild) %>% summarise(n = n())
top_edges <- edges_symetric_all %>% mutate(avatar = node_1) %>%
filter(node_2_guild != -1) %>%
group_by(avatar, pred_date) %>%
arrange(desc(weight)) %>%
slice(1:10) %>%
mutate(rank = paste0("top-", seq_len(n()))) %>%
select(avatar, weight, node_2_guild, node_2, rank, pred_date) %>%
group_by()
num_edges_by_number_of_top_connections <-
top_edges %>%
right_join(data_train %>%
mutate(pred_date = as.character(pred_date)) %>%
mutate(label_guild = guild) %>%
#filter(known_label) %>%
filter(!is.na(label_guild)) %>%
select(avatar, label_guild, pred_date)) %>%
filter(!is.na(weight)) %>% group_by(rank) %>%
summarise(edges_to_target_guild = sum(node_2_guild == label_guild),
edges_to_other_guilds = n() - edges_to_target_guild)
num_edges_by_number_of_top_connections2 <- as.data.frame(t(
sapply(seq_len(nrow(num_edges_by_number_of_top_connections)), function(x){
rank_values <- paste0("top-", 1:x)
tmp <- num_edges_by_number_of_top_connections %>%
filter(rank %in% rank_values)
edges_to_other_guilds <- sum(tmp$edges_to_other_guilds)
edges_to_target_guild <- sum(tmp$edges_to_target_guild)
c(rank = paste0("top-", x),
to_target_guild = edges_to_target_guild / (edges_to_target_guild + edges_to_other_guilds),
to_other_guilds = edges_to_other_guilds / (edges_to_target_guild + edges_to_other_guilds))
}))) %>%
mutate(rank = as.character(rank),
to_target_guild = as.double(as.character(to_target_guild)),
to_other_guilds = as.double(as.character(to_other_guilds))) %>%
rbind(data.frame(rank = "All Edges",
to_target_guild = all_edges$n[all_edges$to_target_guild] / sum(all_edges$n),
to_other_guilds = all_edges$n[!all_edges$to_target_guild] / sum(all_edges$n)))
num_edges_by_number_of_top_connections2 %>%
gather(key = type, value = edges_percent, -rank) %>%
mutate(rank = factor(rank, levels =
c(paste0("top-",1:nrow(num_edges_by_number_of_top_connections)), "All Edges"))) %>%
ggplot(aes(x = rank, y = edges_percent)) +
geom_bar(aes(fill = type), stat = 'identity', position = 'dodge') +
#geom_text(aes(label = paste0(100 * round(edges_percent, 4), "%"), color = type),
# position = position_dodge(width = .8), angle = 45, vjust = -0.6, hjust = 0.15) +
geom_label(aes(label = paste0(100 * round(edges_percent, 4), "%"), fill = type),
position = position_dodge(width = .8), vjust = -0.2, alpha = 0.65, color = 'white') +
theme_bw() +
labs(x = "How Many of the Avatars' Top-Weight Edges Are Considered",
y = "Percentage of Various Types of Edges",
title = "Percentage of Edges that Go to the Members Of the Target Guild",
subtitle = "Within Different Subsets of the Avatars' Top-Weight Edges")