Исключаем неактивных игроков, просматривая долю. Наверное их надо как-то учитывать, если считать модельки с ними, то смотреть в сторону biglm
nrow(r)
## [1] 53430
nrow(subset(r, Level >= 10))
## [1] 7911
df <- subset(r, Level >= 10)
# Внимательно проверяем гребанный формат времени в экселе. Справка по
# символам в ?strftime
df$regdate <- as.POSIXct(as.character(df$Reg.Time), format = "%m-%d-%y %H:%M")
# Выкидываем лиги, у которых нет игроков от 10 уровня
df$League <- factor(df$League)
Возраст игроков, по дням
df$daysreg <- as.integer(as.period(logdate - df$regdate)/days(1))
## estimate only: convert difftimes to intervals for accuracy
## estimate only: convert to intervals for accuracy
hist(df$daysreg)
Посмотрим на статистику по кланам
library(dplyr)
dfl <- group_by(df, League)
leagues.summary.all <- arrange(summarise(dfl, avg = mean(Level), med = median(Level),
ttl = sum(Level), iqr = IQR(Level), npeople = length(Level), avgdays = mean(daysreg),
meddays = median(daysreg), iqrdays = IQR(daysreg), sddays = sd(daysreg)),
desc(npeople), desc(med))
hist(leagues.summary.all$med)
head(leagues.summary.all)
## Source: local data frame [6 x 10]
##
## League avg med ttl iqr npeople avgdays meddays
## 1 25.49 23.0 141156 14.00 5537 331.8 350.0
## 338 No_Limit 91.75 90.5 6239 28.00 68 312.9 348.5
## 260 Insane 93.18 90.0 5777 29.25 62 323.0 346.5
## 188 Death_From_Heavens 95.23 100.0 5809 23.00 61 320.9 349.0
## 460 __ToR__ 77.61 78.0 4579 34.50 59 284.9 315.0
## 302 Light 92.00 100.0 4968 30.00 54 324.0 348.0
## Variables not shown: iqrdays (dbl), sddays (dbl)
leagues.summary <- subset(leagues.summary.all, League != "")
Понятно что клан версус не клан довольно бессмысленный стат сам по себе, лучше бы смотреть в динамике. Но, например, высокоуровневых неклановых игроков можно бы.
Средний уровень vs количество игроков в клане
plot(leagues.summary$npeople, leagues.summary$avg)
Тупая разбивка лиг:
lssc <- sapply(leagues.summary[, 2:6], scale)
leagues.summary$cluster <- as.factor(kmeans(lssc, 3)$cluster)
leagues.summary$clusterdays <- kmeans(scale(leagues.summary$iqrdays), 3)$cluster
qplot(npeople, avg, color = as.factor(cluster), data = leagues.summary)
qplot(npeople, med, color = as.factor(cluster), data = leagues.summary)
гетерогенность
qplot(npeople, iqr, color = as.factor(cluster), data = leagues.summary)
arrange(leagues.summary, desc(iqrdays))$clusterdays
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [35] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [69] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [103] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [137] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [171] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [205] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [239] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [273] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [307] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [341] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [375] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [409] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [443] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [477] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [511] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [545] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [579] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [613] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [647] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [681] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [715] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [749] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [783] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [817] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [851] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [885] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [919] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [953] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [987] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1021] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
qplot(npeople, med, color = as.factor(clusterdays), data = leagues.summary)
Вышел ли хороший линейный предиктор?
dfl <- merge(df, leagues.summary[, c("League", "cluster", "clusterdays")], all.x = T)
dfl <- subset(dfl, !(is.na(cluster)))
dfl$cluster <- as.factor(dfl$cluster)
dfl$clusterdays <- as.factor(dfl$clusterdays)
summary(lm(dfl$Level ~ dfl$cluster + dfl$clusterdays + dfl$daysreg))
##
## Call:
## lm(formula = dfl$Level ~ dfl$cluster + dfl$clusterdays + dfl$daysreg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.61 -6.94 -1.74 7.84 69.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.04968 1.62381 25.28 < 2e-16 ***
## dfl$cluster2 -30.18991 0.84039 -35.92 < 2e-16 ***
## dfl$cluster3 20.06935 0.93898 21.37 < 2e-16 ***
## dfl$clusterdays2 -8.25136 0.94436 -8.74 < 2e-16 ***
## dfl$clusterdays3 -9.39993 1.34617 -6.98 3.7e-12 ***
## dfl$daysreg 0.08356 0.00454 18.41 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.1 on 2368 degrees of freedom
## Multiple R-squared: 0.735, Adjusted R-squared: 0.734
## F-statistic: 1.31e+03 on 5 and 2368 DF, p-value: <2e-16
Что неудивительно, так как кланы кластеризовали в т.ч. по среднему уровню
qplot(meddays, npeople, color = as.factor(iqrdays), data = leagues.summary)
dflrpart <- dfl[, c("Type", "Status", "Level", "Faction", "regdate", "daysreg",
"cluster", "clusterdays")]
rp <- rpart(Level ~ ., data = dflrpart)
print(rp)
## n= 2374
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 2374 1780000 49.44
## 2) cluster=2 1340 101900 30.52 *
## 3) cluster=1,3 1034 576100 73.97
## 6) cluster=1 486 201300 62.53
## 12) regdate>=1.358e+09 65 22740 46.08 *
## 13) regdate< 1.358e+09 421 158200 65.08 *
## 7) cluster=3 548 255000 84.11
## 14) regdate>=1.361e+09 84 50250 58.26 *
## 15) regdate< 1.361e+09 464 138500 88.78 *
library(pvclust)
# http://www.statmethods.net/advstats/cluster.html
fithc2 <- pvclust(lssc, method.hclust = "ward", method.dist = "euclidean")
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
plot(fithc2) # dendogram with p values
# add rectangles around groups highly supported by the data
pvrect(fithc2, alpha = 0.95)
library(mclust)
## Package 'mclust' version 4.2
fitmc <- Mclust(lssc)
## Warning: best model occurs at the min or max # of components considered
## Warning: optimal number of clusters occurs at max choice
plot(fitmc) # plot results
summary(fitmc) # display the best model
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEV (ellipsoidal, equal volume and shape) model with 9 components:
##
## log.likelihood n df BIC ICL
## 2333 1051 148 3636 3515
##
## Clustering table:
## 1 2 3 4 5 6 7 8 9
## 19 20 29 2 5 9 6 907 54