df1 <- fread("Steam_games_2024_cleaned.csv")
df2 <- fread("applicationGenres.csv")
# 挑选出两个游戏(Cities: Skylines, RimWorld),并提取出这两个游戏的AppID
df1 %>% select(AppID, Name) %>%
filter(Name %in% c("Cities: Skylines", "RimWorld"))
## AppID Name
## <int> <char>
## 1: 255710 Cities: Skylines
## 2: 294100 RimWorld
# 提取出这两个游戏在线时间人数的数据
df.game1 <- fread("Online Player Counts/255710.csv")
df.game2 <- fread("Online Player Counts/294100.csv")
目的:分析时间序列特征,并构建模型来对序列未来的数值进行预测。
df.game1 %>%
mutate(Playercount = ifelse(Playercount == 0, NA, Playercount)) %>%
mutate(Time = ymd_hm(Time)) -> df.game1
df.game1 %>%
ggplot() +
geom_line(aes(x = Time, y = Playercount), color = "darkgreen") +
theme_pander() +
ggtitle("Cities: Skylines") +
ylab("Online Players") +
xlab("Time")
df.game1 %>%
filter(is.na(Playercount)) %>%
nrow()
## [1] 251
使用滑动均值的方式填充序列中的缺失值。
df.game1$Playercount <- na_ma(df.game1$Playercount, k = 4)
由于这个序列具有很明显的周期性,而且还存在多个明显的周期(一天/24小时,一周/7*24小时),因此可以在构建预测模型时加入这些周期的dummy variable,并拟合得到相应的系数。此外,这个序列还存在一定的随时间变化的趋势,因此可以将时间变量t加入模型中。
df.game1 %>%
mutate(Hour = hour(Time) %>% as.factor(),
Week = weekdays(Time) %>% as.factor(),
Month = month(Time) %>% as.factor()) %>%
slice((n() - 1000+1):n()) -> df.game1.sub
df.game1.sub.tr <- df.game1.sub %>% slice(1:700)
df.game1.sub.te <- df.game1.sub %>% slice(701:1000)
model.lm <- lm(formula = Playercount ~ Time + Hour + Week,
data = df.game1.sub.tr)
model.lm.sum <- summary(model.lm)
pred.te <- predict(model.lm, df.game1.sub.te)
df.game1.sub.te %>%
mutate(Playercount_predicted = pred.te) %>%
ggplot() +
geom_line(data = df.game1.sub.tr, aes(x = Time, y = Playercount, color = "Real")) +
geom_line(aes(x = Time, y = Playercount, color = "Real")) +
geom_line(aes(x = Time, y = Playercount_predicted, color = "Predicted")) +
geom_ribbon(aes(x = Time,
xmin = df.game1.sub.te$Time %>% min(),
xmax = df.game1.sub.te$Time %>% max(),
ymin = df.game1.sub.te$Playercount %>% min(),
ymax = df.game1.sub.te$Playercount %>% max()), alpha = 0.2,
fill = "skyblue", linewidth = 1) +
scale_color_manual(breaks = c("Real", "Predicted"),
values = c("darkgreen", "darkorange")) +
theme_pander() +
theme(legend.position = "bottom") +
labs(color = "") +
ggtitle("Real vs Forecasted Online Players for 'Cities: Skylines'")
计算这个模型预测序列最后300个小时数据的误差,并使用MAE、RMSE、MSE、MAPE、R2这5个指标综合评估。
sprintf("MAE: %.2f\n", mean(abs(df.game1.sub.te$Playercount - pred.te))) %>% cat()
## MAE: 1163.90
sprintf("RMSE: %.2f\n", mean((df.game1.sub.te$Playercount - pred.te) ** 2) ** (1/2)) %>% cat()
## RMSE: 1320.81
sprintf("MSE: %.2f\n", mean((df.game1.sub.te$Playercount - pred.te) ** 2)) %>% cat()
## MSE: 1744535.13
sprintf("MAPE: %.2f%%\n", 100 * mean(abs(df.game1.sub.te$Playercount - pred.te) /
df.game1.sub.te$Playercount)) %>% cat()
## MAPE: 8.92%
sprintf("R2: %.2f\n", cor(df.game1.sub.te$Playercount, pred.te) ** 2) %>% cat()
## R2: 0.91
df.game2 %>%
mutate(Playercount = ifelse(Playercount == 0, NA, Playercount)) %>%
mutate(Time = ymd_hm(Time)) -> df.game2
df.game2 %>%
ggplot() +
geom_line(aes(x = Time, y = Playercount), color = "darkgreen") +
theme_pander() +
ggtitle("RimWorld") +
ylab("Online Players") +
xlab("Time")
df.game2 %>%
filter(is.na(Playercount)) %>%
nrow()
## [1] 302
使用滑动均值的方式填充序列中的缺失值。
df.game2$Playercount <- na_ma(df.game2$Playercount, k = 4)
同样,这个序列也具有很明显的周期性,而且还存在多个明显的周期(一天/24小时,一周/7*24小时)。 可以在构建预测模型时加入这些周期的dummy variable。 此外,在较短的一段时间内(若干个月),这个序列还存在一定的随时间变化的趋势,因此将时间变量t加入模型中。
df.game2 %>%
mutate(Hour = hour(Time) %>% as.factor(),
Week = weekdays(Time) %>% as.factor(),
Month = month(Time) %>% as.factor()) %>%
slice((n() - 1000+1):n()) -> df.game2.sub
df.game2.sub.tr <- df.game2.sub %>% slice(1:700)
df.game2.sub.te <- df.game2.sub %>% slice(701:1000)
model.lm <- lm(formula = Playercount ~ Time + Hour + Week,
data = df.game2.sub.tr)
model.lm.sum <- summary(model.lm)
pred.te <- predict(model.lm, df.game2.sub.te)
df.game2.sub.te %>%
mutate(Playercount_predicted = pred.te) %>%
ggplot() +
geom_line(data = df.game2.sub.tr, aes(x = Time, y = Playercount, color = "Real")) +
geom_line(aes(x = Time, y = Playercount, color = "Real")) +
geom_line(aes(x = Time, y = Playercount_predicted, color = "Predicted")) +
geom_ribbon(aes(x = Time,
xmin = df.game2.sub.te$Time %>% min(),
xmax = df.game2.sub.te$Time %>% max(),
ymin = df.game2.sub.te$Playercount %>% min(),
ymax = df.game2.sub.te$Playercount %>% max()), alpha = 0.2,
fill = "skyblue", linewidth = 1) +
scale_color_manual(breaks = c("Real", "Predicted"),
values = c("darkgreen", "darkorange")) +
theme_pander() +
theme(legend.position = "bottom") +
labs(color = "") +
ggtitle("Real vs Forecasted Online Players for 'RimWorld'")
计算这个模型预测序列最后300个小时数据的误差,并使用MAE、RMSE、MSE、MAPE、R2这5个指标综合评估。
sprintf("MAE: %.2f\n", mean(abs(df.game2.sub.te$Playercount - pred.te))) %>% cat()
## MAE: 922.78
sprintf("RMSE: %.2f\n", mean((df.game2.sub.te$Playercount - pred.te) ** 2) ** (1/2)) %>% cat()
## RMSE: 1058.53
sprintf("MSE: %.2f\n", mean((df.game2.sub.te$Playercount - pred.te) ** 2)) %>% cat()
## MSE: 1120494.21
sprintf("MAPE: %.2f%%\n", 100 * mean(abs(df.game2.sub.te$Playercount - pred.te) /
df.game2.sub.te$Playercount)) %>% cat()
## MAPE: 5.95%
sprintf("R2: %.2f\n", cor(df.game2.sub.te$Playercount, pred.te) ** 2) %>% cat()
## R2: 0.82
目标:分析最近100天每种类型游戏在线人数的走势情况。
df.genres <- fread("applicationGenres.csv")
df.genres %>%
filter(Genres != "") %>%
group_by(Genres) %>%
summarise(N = n()) -> df.genres.sum
游戏数量大于50的游戏类型有以下8种,其中Action类的游戏数量最多,Free to Play类别的游戏数量最少。
df.genres.sum %>%
filter(N > 50) %>%
ggplot() +
geom_col(aes(x = reorder(Genres, -N), y = N),
fill = "darkgreen", width = 0.5) +
theme_pander() +
xlab("Genres") +
ylab("Number of Games") +
ggtitle("Distribution of Games by Genres")
使用每种游戏在线人数数据中最后100天的数据拟合一个线性回归模型(以在线人数为自变量,时间为因变量且包含一个截距项),并提取出时间变量的系数的t值。根据t值的正负和大小即可判定是否一个游戏近100天的变化趋势。若t值为正,且大于1.96,则可说明该游戏近100天的在线人数呈显著增长趋势。若t值为负,且小于1.96,则可说明该游戏近100天的在线人数呈显著降低趋势。
df.genres.sum %>%
filter(N > 50) %>%
pull(Genres) -> Genres_selected
df.genres %>%
filter(Genres %in% Genres_selected) -> df.genres.sub
t_value_vec <- rep(NA, nrow(df.genres.sub))
for (i in 1:nrow(df.genres.sub)){
if (i %% 100 == 0){
print(i)
}
if (!file.exists(sprintf("Online Player Counts/%d.csv", df.genres.sub$AppID[i]))){
next
}
df.game <- fread(sprintf("Online Player Counts/%d.csv", df.genres.sub$AppID[i]))
df.game %>%
slice((n() - 24*100+1):n()) %>%
mutate(index = 1:n()) -> df.game.sub
model.lm <- lm(formula = Playercount ~ index,
data = df.game.sub)
model.lm.sum <- summary(model.lm)
t_value <- model.lm.sum$coefficients["index", "t value"]
t_value_vec[i] <- t_value
}
saveRDS(t_value_vec, "t_value_vec.RDS")
t_value_vec <- readRDS("t_value_vec.RDS")
df.genres.sub %>%
select(AppID, Genres) %>%
mutate(t_value = t_value_vec) %>%
na.omit() %>%
group_by(Genres) %>%
summarise(t_value = mean(t_value),
.groups = "drop") %>%
ggplot() +
geom_col(aes(x = reorder(Genres, t_value), y = t_value), width = 0.5,
fill = "darkgreen") +
ylab("t value") +
xlab("Genre") +
theme_pander()
由各种游戏类型t值均值的图像可以看出,在最近100天内,各种游戏的在线人数都呈下降趋势。其中”Free to Play”,“Strategy”, “RPG”这三种类型游戏在线人数的下降趋势较为明显。“Indie”, “Adventure”, “Action”这三种类型游戏的在线人数有轻微的下降趋势。