Data source

Load library and set theme

#load packages
library(tidyverse)
library(janitor)
library(skimr)
library(lubridate)
library(Hmisc)
library(hrbrthemes)
library(RColorBrewer)
library(waffle)
library(ggpubr)
library(cowplot)
library(viridis)

Import data

#from tidytuesday 
##clean dataset from lizawood's github
url <- "https://raw.githubusercontent.com/lizawood/apps-and-games/master/PC_Games/PCgames_2004_2018_raw.csv"

##read in raw data
raw_df <- url %>% 
  read_csv() %>% 
  janitor::clean_names()

Data cleaning

#factors and playtime data
clean_df <- raw_df %>% 
  mutate(price = as.numeric(price),
         score_rank = word(score_rank_userscore_metascore, 1),
         average_playtime = word(playtime_median, 1),
         median_playtime = word(playtime_median, 2),
         median_playtime = str_remove(median_playtime, "\\("),
         median_playtime = str_remove(median_playtime, "\\)"),
         average_playtime = 60 * as.numeric(str_sub(average_playtime, 1, 2)) +
           as.numeric(str_sub(average_playtime, 4, 5)),
         median_playtime = 60 * as.numeric(str_sub(median_playtime, 1, 2)) +
           as.numeric(str_sub(median_playtime, 4, 5)),
         metascore = as.double(str_sub(score_rank_userscore_metascore, start = -4, end = -3))) %>% 
  select(-score_rank_userscore_metascore, -score_rank, -playtime_median) %>% 
    rename(publisher = publisher_s, developer = developer_s) %>%
  mutate(release_date = mdy(release_date))
## summary
cdf <- clean_df %>%
  mutate(
    release_year = year(release_date),
    release_month = month(release_date)
  )
cdf = cdf %>% mutate_at (vars(game, owners, developer, publisher, release_year, release_month), list(factor))
skim(cdf)
Data summary
Name cdf
Number of rows 26688
Number of columns 12
_______________________
Column type frequency:
Date 1
factor 6
numeric 5
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
release_date 1 1 2004-01-11 2018-12-31 2017-06-09 2951

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
game 3 1.00 FALSE 26610 Dar: 3, 18 : 2, Alt: 2, Ash: 2
owners 0 1.00 FALSE 13 0 .: 17923, 20,: 3126, 50,: 1733, 100: 1408
developer 151 0.99 FALSE 16752 Cho: 92, KOE: 70, Rip: 62, SEG: 60
publisher 95 1.00 FALSE 13953 Big: 284, SEG: 141, Str: 129, Ubi: 123
release_year 1 1.00 FALSE 15 201: 8846, 201: 6872, 201: 4504, 201: 2544
release_month 1 1.00 FALSE 12 10: 2684, 9: 2588, 11: 2557, 8: 2469

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
number 0 1.00 2904.19 2377.07 1.00 821.00 2356.00 4523.25 8846.00 ▇▅▃▂▁
price 3095 0.88 8.95 12.16 0.49 2.99 5.99 9.99 595.99 ▇▁▁▁▁
average_playtime 9 1.00 9.06 117.94 0.00 0.00 0.00 0.00 5670.00 ▇▁▁▁▁
median_playtime 12 1.00 5.16 68.91 0.00 0.00 0.00 0.00 3293.00 ▇▁▁▁▁
metascore 23838 0.11 71.90 11.02 20.00 66.00 73.00 80.00 98.00 ▁▁▃▇▂
# drop ob with no date 
cdf2 = cdf[!is.na(cdf$release_date),]
#new variable based on owners > 20,000
cdf2$high_owners = if_else(cdf2$owners == "0 .. 20,000", "0", "1")
cdf2$high_owners = as.factor(cdf2$high_owners)
Hmisc:: describe(cdf2$high_owners) #32.8% have high owners
## cdf2$high_owners 
##        n  missing distinct 
##    26687        0        2 
##                       
## Value          0     1
## Frequency  17923  8764
## Proportion 0.672 0.328

Visualization

Games released by month

gmonth = cdf2 %>% filter(!is.na(release_month)) %>% filter(!is.na(release_year)) %>% group_by(release_year, release_month) %>% tally() %>% rename(monthn = n)
p1 <-ggplot(gmonth, aes(x=release_month, y=monthn, color=monthn)) +
  geom_boxplot() + coord_flip() + 
  theme_light() +
    theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  scale_fill_viridis() +
  xlab("Month") +
  ylab("Count of games released") +  
  labs(title="Games released by month", subtitle="Steam games released in 2004 to 2018")
p1

*There are more games released in the last quarter of the year, coinciding with the holiday period.

Most common game prices

#count of unique game prices
n_distinct(cdf2$price, na.rm = TRUE)
## [1] 199
summary(cdf2$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.490   2.990   5.990   8.947   9.990 595.990    3094
p2 = cdf2 %>% filter(!is.na(price)) %>% group_by(price) %>% tally() %>% rename(pri = n) %>% 
    arrange(desc(pri)) %>% slice(1:10) %>% 
    ggplot() + geom_col(aes(x=as.factor(price), y=pri, fill=pri)) + 
    theme_light() +
    theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  scale_fill_viridis() +
  xlab("Price in USD") +
  ylab("") +  
  labs(title="10 Most common game prices on Steam", subtitle="Steam games released in 2004 to 2018", fill="Frequency")
p2

Count of games by owners levels

#summary by owners (13 levels)
cdf2$owners = factor(cdf2$owners, levels = c("0 .. 20,000", "20,000 .. 50,000", "50,000 .. 100,000", "100,000 .. 200,000", "200,000 .. 500,000", "500,000 .. 1,000,000", "1,000,000 .. 2,000,000", "2,000,000 .. 5,000,000", "5,000,000 .. 10,000,000", "10,000,000 .. 20,000,000", "20,000,000 .. 50,000,000", "50,000,000 .. 100,000,000", "100,000,000 .. 200,000,000"))
perc = cdf2 %>% count(factor(owners)) %>% mutate(perc = n*100/sum(n)) 
perc
#plot
p3 = cdf2 %>% filter(!is.na(owners)) %>% count(owners) %>% rename(owners_n = n) %>% arrange(desc(owners_n)) %>%
  ggplot( aes(x=owners, y=owners_n) ) +
    geom_segment( aes(x=owners ,xend=owners, y=0, yend=owners_n), color="#495867") +
    geom_point(size=2, color="#495867") + geom_text(aes(label=owners_n, hjust= -1)) +
    coord_flip() +
    theme_light() +
    theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  xlab("") +
  ylab("") +
  labs(title="Count of games by owners levels")
p3

gamet = cdf2 %>% select(game, owners) %>% filter(owners %in% c("100,000,000 .. 200,000,000", "50,000,000 .. 100,000,000", "20,000,000 .. 50,000,000" )) %>% arrange(owners)
gamet

High owner games by year * As majority (67.16%) of the games on steam have below 20,000 owners, the following section uses a dummy variable i.e., high owners (>20,000) to look at the changes over the years.

#plot percentage of high owners games throughout the years
d2= cdf2 %>% group_by(release_year, high_owners) %>%
    summarise(count=n()) %>%
    mutate(perc= count/sum(count))

p4= ggplot(d2, aes(x=factor(release_year), y = perc, fill = factor (high_owners))) + 
  geom_bar(stat="identity",width= 0.5) + 
  labs(x= "release_year", y ="percent", fill = "high_owners") + 
  scale_y_continuous(labels = scales::percent) + 
  scale_fill_manual("High owners", values = c ("1" = "#d8973c", "0" = "#d6d5c9")) + 
  theme_light() +
  theme(
    axis.text.x = element_text(angle = 45, hjust=0.5),
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
  ) +
  xlab("") +
  ylab("") +  
  labs(title="Percentage of high owner games (>20,000)") 
p4

Games release year

p5 = ggplot(cdf2, aes(as.factor(release_year))) + 
  geom_bar(fill= "#606c38") + 
  theme_light() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
  ) +
  xlab("") +
  ylab("") +  
  labs(title = "Games by year")
p5

avg playtime and release year

s1<-cdf2 %>% filter(median_playtime >0) %>% filter(average_playtime >0) 
p6 <-cdf2 %>% filter(median_playtime >0) %>% filter(average_playtime >0) %>% 
  ggplot(aes(x=as.factor(release_year), y=average_playtime)) +
  geom_boxplot() + coord_flip() + 
  theme_light() +
    theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position= "none"
  ) +
  xlab("") +
  ylab("Average playtime") +  
  labs(title="Average 2-week playtime of active games by release year")
p6

Metascore

Metascore by year (Reference: @david_craryon)

median_values <- cdf2 %>%
  group_by(release_year) %>%
  summarise(
    med = median(metascore, na.rm = TRUE),
    mean = median(metascore, na.rm = TRUE)
  )
  
p7 = ggplot(cdf2, aes(x = release_year, y = metascore)) +
  geom_boxplot(aes(color = release_year)) +
  geom_point(data = median_values, aes(x = release_year, y = med, fill = release_year), shape = 23, size = 4, color = "black") +
  guides(fill = FALSE, color = FALSE) +
  coord_flip() +
  labs(y = "Metascore", x = "", title = "Metascore by year") +
  theme_classic() +
  scale_color_viridis_d(option = "cividis") +
  scale_fill_viridis_d(option = "cividis")
p7

Metascore by number of owners (Reference: david_craryon)

median_values <- cdf2 %>%
  group_by(owners) %>%
  summarise(
    med = median(metascore, na.rm = TRUE),
    mean = median(metascore, na.rm = TRUE)
  )
  
p8 = ggplot(cdf2, aes(x = owners, y = metascore)) +
  geom_boxplot(aes(color = owners)) +
  geom_point(data = median_values, aes(x = owners, y = med, fill = owners), shape = 23, size = 4, color = "black") +
  guides(fill = FALSE, color = FALSE) +
  coord_flip() +
  labs(y = "Metascore", x = "# of owners", title = "Metascore values by owners level") +
  theme_classic() +
  scale_color_viridis_d(option="cividis") +
  scale_fill_viridis_d(option="cividis")
p8

  • As expected, the median metascore has some positive relationship with the number of owners.

Developers

Largest developers

#games released
p9 = cdf2 %>% filter(!is.na(developer)) %>% count(developer) %>% rename(devn = n) %>% arrange(desc(devn)) %>% slice(1:10) %>%
    ggplot(aes(x=developer, y=devn) ) +
    geom_segment(aes(x=reorder(developer,devn),xend=developer, y=0, yend=devn), color="#495867") +
    geom_point(size=2, color="#495867") + scale_y_continuous(limits = c(0, 100)) +
    coord_flip() +
    theme_light() +
    theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  xlab("") +
  ylab("") +  
  labs(title="Largest developers by games released")

#games released with >20,000 owners 
p10= cdf2 %>% filter(high_owners==1) %>% filter(!is.na(developer)) %>% 
    count(developer) %>% rename(devn = n) %>% arrange(desc(devn)) %>% slice(1:10) %>%
    ggplot( aes(x=developer, y=devn) ) +
    geom_segment( aes(x=reorder(developer,devn) ,xend=developer, y=0, yend=devn), color="#495867") +
    geom_point(size=2, color="#495867") + scale_y_continuous(limits = c(0, 100)) +
    coord_flip() +
    theme_light() +
    theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  xlab("") +
  ylab("") +  
  labs(title="Largest developers by games released with >20,000 owners")
#plot
plot_grid(p9, p10, ncol = 1, align = "v")

  • 9 out of 10 largest developers by count of games with more than 20,000 owners are not the top 10 largest developers by count of games list.

Largest developer and distribution of owners (Reference: Christopher Yee)

# create new variable: age, min/max_owner, remove games with wrong dates, studio games with prices >150
cdf3 = cdf2 %>% 
  mutate(release_year = as.numeric(str_trim(release_year)), 
           release_year = case_when(release_year == 5 ~ 2015, # INCORRECT DATA POINT
                          TRUE ~ release_year),
           age = 2019 - release_year)  %>%
  mutate(max_owners = str_trim(word(owners, 2, sep = "\\..")),
         max_owners = as.numeric(str_replace_all(max_owners, ",", "")),
         min_owners = str_trim(word(owners, 1, sep = "\\..")),
         min_owners = as.numeric(str_replace_all(min_owners, ",", ""))) 
p11= cdf3 %>% 
  select(-number, -release_date) %>% 
  mutate(max_owners = as.factor(max_owners / 1000000)) %>% 
  group_by(developer) %>% 
  mutate(n = n()) %>% 
  ungroup() %>% 
  filter(n >= 43, developer != 'NA') %>%
  mutate(developer = paste0(developer, " (", n, ")"),
         developer= reorder(developer, (n))) %>% 
  ggplot(aes(developer, max_owners, color = developer)) +
  geom_jitter(show.legend = FALSE, size = 2.5, alpha = 0.5) +
  scale_color_brewer(palette = 'BrBG') +
  theme_light() + coord_flip() + 
  labs(x = "Developer (total games released)",
       y = "Estimated game owners per million",
       title = "Distribution of owners levels by developer") +
 theme(
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    plot.title = element_text(size=11)
  ) 
p11

Publishers

#games released
p12= cdf2 %>% filter(!is.na(publisher)) %>% count(publisher) %>% rename(pubn = n) %>% arrange(desc(pubn)) %>% slice(1:10) %>%
    ggplot(aes(x=publisher, y=pubn) ) +
    geom_segment(aes(x=reorder(publisher,pubn),xend=publisher, y=0, yend=pubn), color="#495867") +
    geom_point(size=2, color="#495867") + scale_y_continuous(limits = c(0, 300)) +
    coord_flip() +
    theme_light() +
    theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  xlab("") +
  ylab("") +  
  labs(title="Largest publishers by games released")
#games released with >20,000 owners
p13= cdf2 %>% filter(high_owners==1) %>% count(publisher) %>% rename(pubn = n) %>% arrange(desc(pubn)) %>% slice(1:10) %>%
    ggplot(aes(x=publisher, y=pubn) ) +
    geom_segment(aes(x=reorder(publisher,pubn),xend=publisher, y=0, yend=pubn), color="#495867") +
    geom_point(size=2, color="#495867") + scale_y_continuous(limits = c(0, 300)) +
    coord_flip() +
    theme_light() +
    theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  xlab("") +
  ylab("") +  
  labs(title="Largest publishers by games released with >20,000 owners")
#plot
plot_grid(p12, p13, ncol = 1, align = "v")

  • 6 out of 10 largest publishers by count of games are also in the top 10 largest publishers by count of games with more than 20,000 owners. The six publishers are SEGA, Strategy First, Ubisoft, Square Enix, IC Entertainment, THQ Nordic.