1 Load Datasets

df1 <- fread("Steam_games_2024_cleaned.csv")
df2 <- fread("applicationGenres.csv")
# 挑选出两个游戏(Cities: Skylines, RimWorld),并提取出这两个游戏的AppID
df1 %>% select(AppID, Name) %>% 
  filter(Name %in% c("Cities: Skylines", "RimWorld"))
##     AppID             Name
##     <int>           <char>
## 1: 255710 Cities: Skylines
## 2: 294100         RimWorld
# 提取出这两个游戏在线时间人数的数据
df.game1 <- fread("Online Player Counts/255710.csv")
df.game2 <- fread("Online Player Counts/294100.csv")

2 时间序列建模

目的:分析时间序列特征,并构建模型来对序列未来的数值进行预测。

2.1 游戏1;Cities: Skylines

df.game1 %>% 
  mutate(Playercount = ifelse(Playercount == 0, NA, Playercount)) %>% 
  mutate(Time = ymd_hm(Time)) -> df.game1

df.game1 %>% 
  ggplot() + 
  geom_line(aes(x = Time, y = Playercount), color = "darkgreen") + 
  theme_pander() + 
  ggtitle("Cities: Skylines") + 
  ylab("Online Players") + 
  xlab("Time")

2.1.1 Impute Missing Values

df.game1 %>% 
  filter(is.na(Playercount)) %>% 
  nrow()
## [1] 251

使用滑动均值的方式填充序列中的缺失值。

df.game1$Playercount <- na_ma(df.game1$Playercount, k = 4)

2.1.2 Model

由于这个序列具有很明显的周期性,而且还存在多个明显的周期(一天/24小时,一周/7*24小时),因此可以在构建预测模型时加入这些周期的dummy variable,并拟合得到相应的系数。此外,这个序列还存在一定的随时间变化的趋势,因此可以将时间变量t加入模型中。

df.game1 %>% 
  mutate(Hour = hour(Time) %>% as.factor(), 
         Week = weekdays(Time) %>% as.factor(), 
         Month = month(Time) %>% as.factor()) %>% 
  slice((n() - 1000+1):n()) -> df.game1.sub
df.game1.sub.tr <- df.game1.sub %>% slice(1:700)
df.game1.sub.te <- df.game1.sub %>% slice(701:1000)
model.lm <- lm(formula = Playercount ~ Time + Hour + Week, 
               data = df.game1.sub.tr)
model.lm.sum <- summary(model.lm)
pred.te <- predict(model.lm, df.game1.sub.te)
df.game1.sub.te %>% 
  mutate(Playercount_predicted = pred.te) %>% 
  ggplot() + 
  geom_line(data = df.game1.sub.tr, aes(x = Time, y = Playercount, color = "Real")) + 
  geom_line(aes(x = Time, y = Playercount, color = "Real")) + 
  geom_line(aes(x = Time, y = Playercount_predicted, color = "Predicted")) + 
  geom_ribbon(aes(x = Time, 
                  xmin = df.game1.sub.te$Time %>% min(), 
                  xmax = df.game1.sub.te$Time %>% max(), 
                  ymin = df.game1.sub.te$Playercount %>% min(), 
                  ymax = df.game1.sub.te$Playercount %>% max()), alpha = 0.2, 
              fill = "skyblue", linewidth = 1) + 
  scale_color_manual(breaks = c("Real", "Predicted"), 
                     values = c("darkgreen", "darkorange")) + 
  theme_pander() + 
  theme(legend.position = "bottom") + 
  labs(color = "") + 
  ggtitle("Real vs Forecasted Online Players for 'Cities: Skylines'")

2.1.3 Model Evaluation

计算这个模型预测序列最后300个小时数据的误差,并使用MAE、RMSE、MSE、MAPE、R2这5个指标综合评估。

sprintf("MAE: %.2f\n", mean(abs(df.game1.sub.te$Playercount - pred.te))) %>% cat()
## MAE: 1163.90
sprintf("RMSE: %.2f\n", mean((df.game1.sub.te$Playercount - pred.te) ** 2) ** (1/2)) %>% cat()
## RMSE: 1320.81
sprintf("MSE: %.2f\n", mean((df.game1.sub.te$Playercount - pred.te) ** 2)) %>% cat()
## MSE: 1744535.13
sprintf("MAPE: %.2f%%\n", 100 * mean(abs(df.game1.sub.te$Playercount - pred.te) / 
                                       df.game1.sub.te$Playercount)) %>% cat()
## MAPE: 8.92%
sprintf("R2: %.2f\n", cor(df.game1.sub.te$Playercount, pred.te) ** 2) %>% cat()
## R2: 0.91

2.2 游戏2;RimWorld

df.game2 %>% 
  mutate(Playercount = ifelse(Playercount == 0, NA, Playercount)) %>% 
  mutate(Time = ymd_hm(Time)) -> df.game2

df.game2 %>% 
  ggplot() + 
  geom_line(aes(x = Time, y = Playercount), color = "darkgreen") + 
  theme_pander() + 
  ggtitle("RimWorld") + 
  ylab("Online Players") + 
  xlab("Time")

2.2.1 Impute Missing Values

df.game2 %>% 
  filter(is.na(Playercount)) %>% 
  nrow()
## [1] 302

使用滑动均值的方式填充序列中的缺失值。

df.game2$Playercount <- na_ma(df.game2$Playercount, k = 4)

2.2.2 Model

同样,这个序列也具有很明显的周期性,而且还存在多个明显的周期(一天/24小时,一周/7*24小时)。 可以在构建预测模型时加入这些周期的dummy variable。 此外,在较短的一段时间内(若干个月),这个序列还存在一定的随时间变化的趋势,因此将时间变量t加入模型中。

df.game2 %>% 
  mutate(Hour = hour(Time) %>% as.factor(), 
         Week = weekdays(Time) %>% as.factor(), 
         Month = month(Time) %>% as.factor()) %>% 
  slice((n() - 1000+1):n()) -> df.game2.sub
df.game2.sub.tr <- df.game2.sub %>% slice(1:700)
df.game2.sub.te <- df.game2.sub %>% slice(701:1000)
model.lm <- lm(formula = Playercount ~ Time + Hour + Week, 
               data = df.game2.sub.tr)
model.lm.sum <- summary(model.lm)
pred.te <- predict(model.lm, df.game2.sub.te)
df.game2.sub.te %>% 
  mutate(Playercount_predicted = pred.te) %>% 
  ggplot() + 
  geom_line(data = df.game2.sub.tr, aes(x = Time, y = Playercount, color = "Real")) + 
  geom_line(aes(x = Time, y = Playercount, color = "Real")) + 
  geom_line(aes(x = Time, y = Playercount_predicted, color = "Predicted")) + 
  geom_ribbon(aes(x = Time, 
                  xmin = df.game2.sub.te$Time %>% min(), 
                  xmax = df.game2.sub.te$Time %>% max(), 
                  ymin = df.game2.sub.te$Playercount %>% min(), 
                  ymax = df.game2.sub.te$Playercount %>% max()), alpha = 0.2, 
              fill = "skyblue", linewidth = 1) + 
  scale_color_manual(breaks = c("Real", "Predicted"), 
                     values = c("darkgreen", "darkorange")) + 
  theme_pander() + 
  theme(legend.position = "bottom") + 
  labs(color = "") + 
  ggtitle("Real vs Forecasted Online Players for 'RimWorld'")

2.2.3 Model Evaluation

计算这个模型预测序列最后300个小时数据的误差,并使用MAE、RMSE、MSE、MAPE、R2这5个指标综合评估。

sprintf("MAE: %.2f\n", mean(abs(df.game2.sub.te$Playercount - pred.te))) %>% cat()
## MAE: 922.78
sprintf("RMSE: %.2f\n", mean((df.game2.sub.te$Playercount - pred.te) ** 2) ** (1/2)) %>% cat()
## RMSE: 1058.53
sprintf("MSE: %.2f\n", mean((df.game2.sub.te$Playercount - pred.te) ** 2)) %>% cat()
## MSE: 1120494.21
sprintf("MAPE: %.2f%%\n", 100 * mean(abs(df.game2.sub.te$Playercount - pred.te) / 
                                       df.game2.sub.te$Playercount)) %>% cat()
## MAPE: 5.95%
sprintf("R2: %.2f\n", cor(df.game2.sub.te$Playercount, pred.te) ** 2) %>% cat()
## R2: 0.82

3 分类分析

目标:分析最近100天每种类型游戏在线人数的走势情况。

df.genres <- fread("applicationGenres.csv")
df.genres %>% 
  filter(Genres != "") %>% 
  group_by(Genres) %>% 
  summarise(N = n()) -> df.genres.sum

游戏数量大于50的游戏类型有以下8种,其中Action类的游戏数量最多,Free to Play类别的游戏数量最少。

df.genres.sum %>% 
  filter(N > 50) %>% 
  ggplot() + 
  geom_col(aes(x = reorder(Genres, -N), y = N), 
           fill = "darkgreen", width = 0.5) + 
  theme_pander() + 
  xlab("Genres") + 
  ylab("Number of Games") + 
  ggtitle("Distribution of Games by Genres")

使用每种游戏在线人数数据中最后100天的数据拟合一个线性回归模型(以在线人数为自变量,时间为因变量且包含一个截距项),并提取出时间变量的系数的t值。根据t值的正负和大小即可判定是否一个游戏近100天的变化趋势。若t值为正,且大于1.96,则可说明该游戏近100天的在线人数呈显著增长趋势。若t值为负,且小于1.96,则可说明该游戏近100天的在线人数呈显著降低趋势。

df.genres.sum %>% 
  filter(N > 50) %>% 
  pull(Genres) -> Genres_selected
df.genres %>% 
  filter(Genres %in% Genres_selected) -> df.genres.sub
t_value_vec <- rep(NA, nrow(df.genres.sub))
for (i in 1:nrow(df.genres.sub)){
  if (i %% 100 == 0){
    print(i)
  }
  if (!file.exists(sprintf("Online Player Counts/%d.csv", df.genres.sub$AppID[i]))){
    next
  }
  df.game <- fread(sprintf("Online Player Counts/%d.csv", df.genres.sub$AppID[i]))
  df.game %>% 
    slice((n() - 24*100+1):n()) %>% 
    mutate(index = 1:n()) -> df.game.sub
  model.lm <- lm(formula = Playercount ~ index, 
               data = df.game.sub)
  model.lm.sum <- summary(model.lm)
  t_value <- model.lm.sum$coefficients["index", "t value"]
  t_value_vec[i] <- t_value
}
saveRDS(t_value_vec, "t_value_vec.RDS")
t_value_vec <- readRDS("t_value_vec.RDS")
df.genres.sub %>% 
  select(AppID, Genres) %>% 
  mutate(t_value = t_value_vec) %>% 
  na.omit() %>% 
  group_by(Genres) %>% 
  summarise(t_value = mean(t_value), 
            .groups = "drop") %>% 
  ggplot() + 
  geom_col(aes(x = reorder(Genres, t_value), y = t_value), width = 0.5, 
           fill = "darkgreen") + 
  ylab("t value") + 
  xlab("Genre") + 
  theme_pander()

由各种游戏类型t值均值的图像可以看出,在最近100天内,各种游戏的在线人数都呈下降趋势。其中”Free to Play”,“Strategy”, “RPG”这三种类型游戏在线人数的下降趋势较为明显。“Indie”, “Adventure”, “Action”这三种类型游戏的在线人数有轻微的下降趋势。