#load packages
library(tidyverse)
library(janitor)
library(skimr)
library(lubridate)
library(Hmisc)
library(hrbrthemes)
library(RColorBrewer)
library(waffle)
library(ggpubr)
library(cowplot)
library(viridis)
#from tidytuesday
##clean dataset from lizawood's github
url <- "https://raw.githubusercontent.com/lizawood/apps-and-games/master/PC_Games/PCgames_2004_2018_raw.csv"
##read in raw data
raw_df <- url %>%
read_csv() %>%
janitor::clean_names()
#factors and playtime data
clean_df <- raw_df %>%
mutate(price = as.numeric(price),
score_rank = word(score_rank_userscore_metascore, 1),
average_playtime = word(playtime_median, 1),
median_playtime = word(playtime_median, 2),
median_playtime = str_remove(median_playtime, "\\("),
median_playtime = str_remove(median_playtime, "\\)"),
average_playtime = 60 * as.numeric(str_sub(average_playtime, 1, 2)) +
as.numeric(str_sub(average_playtime, 4, 5)),
median_playtime = 60 * as.numeric(str_sub(median_playtime, 1, 2)) +
as.numeric(str_sub(median_playtime, 4, 5)),
metascore = as.double(str_sub(score_rank_userscore_metascore, start = -4, end = -3))) %>%
select(-score_rank_userscore_metascore, -score_rank, -playtime_median) %>%
rename(publisher = publisher_s, developer = developer_s) %>%
mutate(release_date = mdy(release_date))
## summary
cdf <- clean_df %>%
mutate(
release_year = year(release_date),
release_month = month(release_date)
)
cdf = cdf %>% mutate_at (vars(game, owners, developer, publisher, release_year, release_month), list(factor))
skim(cdf)
| Name | cdf |
| Number of rows | 26688 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| Date | 1 |
| factor | 6 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| release_date | 1 | 1 | 2004-01-11 | 2018-12-31 | 2017-06-09 | 2951 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| game | 3 | 1.00 | FALSE | 26610 | Dar: 3, 18 : 2, Alt: 2, Ash: 2 |
| owners | 0 | 1.00 | FALSE | 13 | 0 .: 17923, 20,: 3126, 50,: 1733, 100: 1408 |
| developer | 151 | 0.99 | FALSE | 16752 | Cho: 92, KOE: 70, Rip: 62, SEG: 60 |
| publisher | 95 | 1.00 | FALSE | 13953 | Big: 284, SEG: 141, Str: 129, Ubi: 123 |
| release_year | 1 | 1.00 | FALSE | 15 | 201: 8846, 201: 6872, 201: 4504, 201: 2544 |
| release_month | 1 | 1.00 | FALSE | 12 | 10: 2684, 9: 2588, 11: 2557, 8: 2469 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| number | 0 | 1.00 | 2904.19 | 2377.07 | 1.00 | 821.00 | 2356.00 | 4523.25 | 8846.00 | ▇▅▃▂▁ |
| price | 3095 | 0.88 | 8.95 | 12.16 | 0.49 | 2.99 | 5.99 | 9.99 | 595.99 | ▇▁▁▁▁ |
| average_playtime | 9 | 1.00 | 9.06 | 117.94 | 0.00 | 0.00 | 0.00 | 0.00 | 5670.00 | ▇▁▁▁▁ |
| median_playtime | 12 | 1.00 | 5.16 | 68.91 | 0.00 | 0.00 | 0.00 | 0.00 | 3293.00 | ▇▁▁▁▁ |
| metascore | 23838 | 0.11 | 71.90 | 11.02 | 20.00 | 66.00 | 73.00 | 80.00 | 98.00 | ▁▁▃▇▂ |
# drop ob with no date
cdf2 = cdf[!is.na(cdf$release_date),]
#new variable based on owners > 20,000
cdf2$high_owners = if_else(cdf2$owners == "0 .. 20,000", "0", "1")
cdf2$high_owners = as.factor(cdf2$high_owners)
Hmisc:: describe(cdf2$high_owners) #32.8% have high owners
## cdf2$high_owners
## n missing distinct
## 26687 0 2
##
## Value 0 1
## Frequency 17923 8764
## Proportion 0.672 0.328
Games released by month
gmonth = cdf2 %>% filter(!is.na(release_month)) %>% filter(!is.na(release_year)) %>% group_by(release_year, release_month) %>% tally() %>% rename(monthn = n)
p1 <-ggplot(gmonth, aes(x=release_month, y=monthn, color=monthn)) +
geom_boxplot() + coord_flip() +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank()
) +
scale_fill_viridis() +
xlab("Month") +
ylab("Count of games released") +
labs(title="Games released by month", subtitle="Steam games released in 2004 to 2018")
p1
*There are more games released in the last quarter of the year, coinciding with the holiday period.
Most common game prices
#count of unique game prices
n_distinct(cdf2$price, na.rm = TRUE)
## [1] 199
summary(cdf2$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.490 2.990 5.990 8.947 9.990 595.990 3094
p2 = cdf2 %>% filter(!is.na(price)) %>% group_by(price) %>% tally() %>% rename(pri = n) %>%
arrange(desc(pri)) %>% slice(1:10) %>%
ggplot() + geom_col(aes(x=as.factor(price), y=pri, fill=pri)) +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank()
) +
scale_fill_viridis() +
xlab("Price in USD") +
ylab("") +
labs(title="10 Most common game prices on Steam", subtitle="Steam games released in 2004 to 2018", fill="Frequency")
p2
Count of games by owners levels
#summary by owners (13 levels)
cdf2$owners = factor(cdf2$owners, levels = c("0 .. 20,000", "20,000 .. 50,000", "50,000 .. 100,000", "100,000 .. 200,000", "200,000 .. 500,000", "500,000 .. 1,000,000", "1,000,000 .. 2,000,000", "2,000,000 .. 5,000,000", "5,000,000 .. 10,000,000", "10,000,000 .. 20,000,000", "20,000,000 .. 50,000,000", "50,000,000 .. 100,000,000", "100,000,000 .. 200,000,000"))
perc = cdf2 %>% count(factor(owners)) %>% mutate(perc = n*100/sum(n))
perc
#plot
p3 = cdf2 %>% filter(!is.na(owners)) %>% count(owners) %>% rename(owners_n = n) %>% arrange(desc(owners_n)) %>%
ggplot( aes(x=owners, y=owners_n) ) +
geom_segment( aes(x=owners ,xend=owners, y=0, yend=owners_n), color="#495867") +
geom_point(size=2, color="#495867") + geom_text(aes(label=owners_n, hjust= -1)) +
coord_flip() +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank()
) +
xlab("") +
ylab("") +
labs(title="Count of games by owners levels")
p3
gamet = cdf2 %>% select(game, owners) %>% filter(owners %in% c("100,000,000 .. 200,000,000", "50,000,000 .. 100,000,000", "20,000,000 .. 50,000,000" )) %>% arrange(owners)
gamet
High owner games by year * As majority (67.16%) of the games on steam have below 20,000 owners, the following section uses a dummy variable i.e., high owners (>20,000) to look at the changes over the years.
#plot percentage of high owners games throughout the years
d2= cdf2 %>% group_by(release_year, high_owners) %>%
summarise(count=n()) %>%
mutate(perc= count/sum(count))
p4= ggplot(d2, aes(x=factor(release_year), y = perc, fill = factor (high_owners))) +
geom_bar(stat="identity",width= 0.5) +
labs(x= "release_year", y ="percent", fill = "high_owners") +
scale_y_continuous(labels = scales::percent) +
scale_fill_manual("High owners", values = c ("1" = "#d8973c", "0" = "#d6d5c9")) +
theme_light() +
theme(
axis.text.x = element_text(angle = 45, hjust=0.5),
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
) +
xlab("") +
ylab("") +
labs(title="Percentage of high owner games (>20,000)")
p4
Games release year
p5 = ggplot(cdf2, aes(as.factor(release_year))) +
geom_bar(fill= "#606c38") +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
) +
xlab("") +
ylab("") +
labs(title = "Games by year")
p5
avg playtime and release year
s1<-cdf2 %>% filter(median_playtime >0) %>% filter(average_playtime >0)
p6 <-cdf2 %>% filter(median_playtime >0) %>% filter(average_playtime >0) %>%
ggplot(aes(x=as.factor(release_year), y=average_playtime)) +
geom_boxplot() + coord_flip() +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position= "none"
) +
xlab("") +
ylab("Average playtime") +
labs(title="Average 2-week playtime of active games by release year")
p6
Metascore by year (Reference: @david_craryon)
median_values <- cdf2 %>%
group_by(release_year) %>%
summarise(
med = median(metascore, na.rm = TRUE),
mean = median(metascore, na.rm = TRUE)
)
p7 = ggplot(cdf2, aes(x = release_year, y = metascore)) +
geom_boxplot(aes(color = release_year)) +
geom_point(data = median_values, aes(x = release_year, y = med, fill = release_year), shape = 23, size = 4, color = "black") +
guides(fill = FALSE, color = FALSE) +
coord_flip() +
labs(y = "Metascore", x = "", title = "Metascore by year") +
theme_classic() +
scale_color_viridis_d(option = "cividis") +
scale_fill_viridis_d(option = "cividis")
p7
Metascore by number of owners (Reference: david_craryon)
median_values <- cdf2 %>%
group_by(owners) %>%
summarise(
med = median(metascore, na.rm = TRUE),
mean = median(metascore, na.rm = TRUE)
)
p8 = ggplot(cdf2, aes(x = owners, y = metascore)) +
geom_boxplot(aes(color = owners)) +
geom_point(data = median_values, aes(x = owners, y = med, fill = owners), shape = 23, size = 4, color = "black") +
guides(fill = FALSE, color = FALSE) +
coord_flip() +
labs(y = "Metascore", x = "# of owners", title = "Metascore values by owners level") +
theme_classic() +
scale_color_viridis_d(option="cividis") +
scale_fill_viridis_d(option="cividis")
p8
Largest developers
#games released
p9 = cdf2 %>% filter(!is.na(developer)) %>% count(developer) %>% rename(devn = n) %>% arrange(desc(devn)) %>% slice(1:10) %>%
ggplot(aes(x=developer, y=devn) ) +
geom_segment(aes(x=reorder(developer,devn),xend=developer, y=0, yend=devn), color="#495867") +
geom_point(size=2, color="#495867") + scale_y_continuous(limits = c(0, 100)) +
coord_flip() +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank()
) +
xlab("") +
ylab("") +
labs(title="Largest developers by games released")
#games released with >20,000 owners
p10= cdf2 %>% filter(high_owners==1) %>% filter(!is.na(developer)) %>%
count(developer) %>% rename(devn = n) %>% arrange(desc(devn)) %>% slice(1:10) %>%
ggplot( aes(x=developer, y=devn) ) +
geom_segment( aes(x=reorder(developer,devn) ,xend=developer, y=0, yend=devn), color="#495867") +
geom_point(size=2, color="#495867") + scale_y_continuous(limits = c(0, 100)) +
coord_flip() +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank()
) +
xlab("") +
ylab("") +
labs(title="Largest developers by games released with >20,000 owners")
#plot
plot_grid(p9, p10, ncol = 1, align = "v")
Largest developer and distribution of owners (Reference: Christopher Yee)
# create new variable: age, min/max_owner, remove games with wrong dates, studio games with prices >150
cdf3 = cdf2 %>%
mutate(release_year = as.numeric(str_trim(release_year)),
release_year = case_when(release_year == 5 ~ 2015, # INCORRECT DATA POINT
TRUE ~ release_year),
age = 2019 - release_year) %>%
mutate(max_owners = str_trim(word(owners, 2, sep = "\\..")),
max_owners = as.numeric(str_replace_all(max_owners, ",", "")),
min_owners = str_trim(word(owners, 1, sep = "\\..")),
min_owners = as.numeric(str_replace_all(min_owners, ",", "")))
p11= cdf3 %>%
select(-number, -release_date) %>%
mutate(max_owners = as.factor(max_owners / 1000000)) %>%
group_by(developer) %>%
mutate(n = n()) %>%
ungroup() %>%
filter(n >= 43, developer != 'NA') %>%
mutate(developer = paste0(developer, " (", n, ")"),
developer= reorder(developer, (n))) %>%
ggplot(aes(developer, max_owners, color = developer)) +
geom_jitter(show.legend = FALSE, size = 2.5, alpha = 0.5) +
scale_color_brewer(palette = 'BrBG') +
theme_light() + coord_flip() +
labs(x = "Developer (total games released)",
y = "Estimated game owners per million",
title = "Distribution of owners levels by developer") +
theme(
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank(),
plot.title = element_text(size=11)
)
p11
#games released
p12= cdf2 %>% filter(!is.na(publisher)) %>% count(publisher) %>% rename(pubn = n) %>% arrange(desc(pubn)) %>% slice(1:10) %>%
ggplot(aes(x=publisher, y=pubn) ) +
geom_segment(aes(x=reorder(publisher,pubn),xend=publisher, y=0, yend=pubn), color="#495867") +
geom_point(size=2, color="#495867") + scale_y_continuous(limits = c(0, 300)) +
coord_flip() +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank()
) +
xlab("") +
ylab("") +
labs(title="Largest publishers by games released")
#games released with >20,000 owners
p13= cdf2 %>% filter(high_owners==1) %>% count(publisher) %>% rename(pubn = n) %>% arrange(desc(pubn)) %>% slice(1:10) %>%
ggplot(aes(x=publisher, y=pubn) ) +
geom_segment(aes(x=reorder(publisher,pubn),xend=publisher, y=0, yend=pubn), color="#495867") +
geom_point(size=2, color="#495867") + scale_y_continuous(limits = c(0, 300)) +
coord_flip() +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank()
) +
xlab("") +
ylab("") +
labs(title="Largest publishers by games released with >20,000 owners")
#plot
plot_grid(p12, p13, ncol = 1, align = "v")