Data Visualization

Harnsen Bahari Putra

2022-01-02

Load the Required Packages

library(tidyverse)
library(ggthemes)
library(ggplot2)
library(reshape2)

Introduction

This dataset contains a list of video games with sales greater than 100,000 copies. It was generated by a scrape of vgchartz.com.

Fields include

Rank - Ranking of overall sales

Name - The game's name

Platform - Platform of the game's release (e.g., PC, PS4)

Year - Year of the game’s release

Genre - Genre of the game

Publisher - Publisher of the game

NA_Sales - Sales in North America (in millions)

EU_Sales - Sales in Europe (in millions)

JP_Sales - Sales in Japan (in millions)

Other_Sales - Sales in the rest of the world (in millions)

Global_Sales - Total worldwide sales (in millions).

The script to scrape the data is available at https://github.com/GregorUT/vgchartzScrape. It is based on BeautifulSoup using Python. There are 16,598 records. 2 records were dropped due to incomplete information.

Read Data

# Read the video game sales table from the working directory and print
# summary statistics for every column.
game <- read.csv(file = "vgsales.csv")
summary(object = game)
##       Rank           Name             Platform             Year          
##  Min.   :    1   Length:16598       Length:16598       Length:16598      
##  1st Qu.: 4151   Class :character   Class :character   Class :character  
##  Median : 8300   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 8301                                                           
##  3rd Qu.:12450                                                           
##  Max.   :16600                                                           
##     Genre            Publisher            NA_Sales          EU_Sales      
##  Length:16598       Length:16598       Min.   : 0.0000   Min.   : 0.0000  
##  Class :character   Class :character   1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Mode  :character   Mode  :character   Median : 0.0800   Median : 0.0200  
##                                        Mean   : 0.2647   Mean   : 0.1467  
##                                        3rd Qu.: 0.2400   3rd Qu.: 0.1100  
##                                        Max.   :41.4900   Max.   :29.0200  
##     JP_Sales         Other_Sales        Global_Sales    
##  Min.   : 0.00000   Min.   : 0.00000   Min.   : 0.0100  
##  1st Qu.: 0.00000   1st Qu.: 0.00000   1st Qu.: 0.0600  
##  Median : 0.00000   Median : 0.01000   Median : 0.1700  
##  Mean   : 0.07778   Mean   : 0.04806   Mean   : 0.5374  
##  3rd Qu.: 0.04000   3rd Qu.: 0.04000   3rd Qu.: 0.4700  
##  Max.   :10.22000   Max.   :10.57000   Max.   :82.7400

Data Pre-Processing

Check data

# Preview the first rows of the raw data.
head(game)
##   Rank                     Name Platform Year        Genre Publisher NA_Sales
## 1    1               Wii Sports      Wii 2006       Sports  Nintendo    41.49
## 2    2        Super Mario Bros.      NES 1985     Platform  Nintendo    29.08
## 3    3           Mario Kart Wii      Wii 2008       Racing  Nintendo    15.85
## 4    4        Wii Sports Resort      Wii 2009       Sports  Nintendo    15.75
## 5    5 Pokemon Red/Pokemon Blue       GB 1996 Role-Playing  Nintendo    11.27
## 6    6                   Tetris       GB 1989       Puzzle  Nintendo    23.20
##   EU_Sales JP_Sales Other_Sales Global_Sales
## 1    29.02     3.77        8.46        82.74
## 2     3.58     6.81        0.77        40.24
## 3    12.88     3.79        3.31        35.82
## 4    11.01     3.28        2.96        33.00
## 5     8.89    10.22        1.00        31.37
## 6     2.26     4.22        0.58        30.26
# Compact per-column overview: column type and leading values.
glimpse(game)
## Rows: 16,598
## Columns: 11
## $ Rank         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17~
## $ Name         <chr> "Wii Sports", "Super Mario Bros.", "Mario Kart Wii", "Wii~
## $ Platform     <chr> "Wii", "NES", "Wii", "Wii", "GB", "GB", "DS", "Wii", "Wii~
## $ Year         <chr> "2006", "1985", "2008", "2009", "1996", "1989", "2006", "~
## $ Genre        <chr> "Sports", "Platform", "Racing", "Sports", "Role-Playing",~
## $ Publisher    <chr> "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo~
## $ NA_Sales     <dbl> 41.49, 29.08, 15.85, 15.75, 11.27, 23.20, 11.38, 14.03, 1~
## $ EU_Sales     <dbl> 29.02, 3.58, 12.88, 11.01, 8.89, 2.26, 9.23, 9.20, 7.06, ~
## $ JP_Sales     <dbl> 3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.50, 2.93, 4.70, 0.~
## $ Other_Sales  <dbl> 8.46, 0.77, 3.31, 2.96, 1.00, 0.58, 2.90, 2.85, 2.26, 0.4~
## $ Global_Sales <dbl> 82.74, 40.24, 35.82, 33.00, 31.37, 30.26, 30.01, 29.02, 2~
# Per-column summary statistics (same call as right after loading the data).
summary(game)
##       Rank           Name             Platform             Year          
##  Min.   :    1   Length:16598       Length:16598       Length:16598      
##  1st Qu.: 4151   Class :character   Class :character   Class :character  
##  Median : 8300   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 8301                                                           
##  3rd Qu.:12450                                                           
##  Max.   :16600                                                           
##     Genre            Publisher            NA_Sales          EU_Sales      
##  Length:16598       Length:16598       Min.   : 0.0000   Min.   : 0.0000  
##  Class :character   Class :character   1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Mode  :character   Mode  :character   Median : 0.0800   Median : 0.0200  
##                                        Mean   : 0.2647   Mean   : 0.1467  
##                                        3rd Qu.: 0.2400   3rd Qu.: 0.1100  
##                                        Max.   :41.4900   Max.   :29.0200  
##     JP_Sales         Other_Sales        Global_Sales    
##  Min.   : 0.00000   Min.   : 0.00000   Min.   : 0.0100  
##  1st Qu.: 0.00000   1st Qu.: 0.00000   1st Qu.: 0.0600  
##  Median : 0.00000   Median : 0.01000   Median : 0.1700  
##  Mean   : 0.07778   Mean   : 0.04806   Mean   : 0.5374  
##  3rd Qu.: 0.04000   3rd Qu.: 0.04000   3rd Qu.: 0.4700  
##  Max.   :10.22000   Max.   :10.57000   Max.   :82.7400

Check for missing values

# Count NA entries per column. This only detects true NA values; placeholder
# strings such as "N/A" are not counted and must be checked separately.
colSums(is.na(game))
##         Rank         Name     Platform         Year        Genre    Publisher 
##            0            0            0            0            0            0 
##     NA_Sales     EU_Sales     JP_Sales  Other_Sales Global_Sales 
##            0            0            0            0            0
# List the distinct Year values to spot non-year placeholders.
unique(game$Year)
##  [1] "2006" "1985" "2008" "2009" "1996" "1989" "1984" "2005" "1999" "2007"
## [11] "2010" "2013" "2004" "1990" "1988" "2002" "2001" "2011" "1998" "2015"
## [21] "2012" "2014" "1992" "1997" "1993" "1994" "1982" "2003" "1986" "2000"
## [31] "N/A"  "1995" "2016" "1991" "1981" "1987" "1980" "1983" "2020" "2017"
# Keep only the rows whose Year is not the literal placeholder string "N/A".
game_clean <- subset(game, Year != "N/A")

Wrangling data frame

# Store Year as a factor so it is treated as a discrete variable, then
# confirm the resulting class.
game_clean$Year <- as.factor(game_clean$Year)
class(game_clean$Year)
## [1] "factor"

EDA (Exploratory Data Analysis)

Gaming Platform

# Number of games per platform, ordered from most to least frequent.
platform_counts <- table(game$Platform)
plataformFreq <- as.data.frame(sort(platform_counts, decreasing = TRUE))

# Bar chart of the counts. Var1 (platform) and Freq (count) are the column
# names as.data.frame() assigns to a one-way table.
ggplot(plataformFreq, aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  labs(title = "Barplot of Platform", x = "Platform", y = "Frequency")

PlayStation global sales per year

# Yearly global sales of the PlayStation consoles, stacked by platform.
# Plots from game_clean rather than game so that rows carrying the "N/A"
# release-year placeholder do not appear as a category on the year axis.
# Each game contributes its own Global_Sales segment; segments stack into
# the yearly total per platform.
ggplot(subset(game_clean, Platform %in% c("PS", "PS2", "PS3", "PS4")),
       aes(x = Year, y = Global_Sales, fill = Platform)) +
  geom_col() +
  labs(
    title = "Barplot of playstation global sales per year",
    x = "Year",
    y = "Global Sales"
  )

Most common game genres

# Number of games per genre, ordered from most to least frequent.
genre_counts <- table(game$Genre)
genreFreq <- as.data.frame(sort(genre_counts, decreasing = TRUE))

# Bar chart of the counts. Var1 (genre) and Freq (count) are the column
# names as.data.frame() assigns to a one-way table.
ggplot(genreFreq, aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  labs(title = "Barplot of Genre", x = "Genre", y = "Frequency")

Top-5 Publisher Distribution by Year

# Rank publishers by their number of distinct game titles and keep the
# five with the most titles (one-column table of publisher names).
publisher_count <- game %>%
  group_by(Publisher) %>%
  summarise(count_game = n_distinct(Name), .groups = "drop") %>%
  arrange(desc(count_game)) %>%
  select(Publisher) %>%
  head(5)

# The same five names as a plain character vector, for use with %in%.
# (Despite the "20" in its name, it holds five publishers.)
publisher_count20 <- pull(publisher_count, Publisher)

# Per-year totals for the selected top publishers: summed global sales and
# number of distinct titles released.
# Built from game_clean rather than game so that rows carrying the "N/A"
# release-year placeholder do not form a spurious "N/A" year group (which
# would otherwise appear as a category on the bubble chart's year axis).
publisher_bubble <- game_clean %>%
  filter(Publisher %in% publisher_count20) %>%
  group_by(Year, Publisher) %>%
  summarise(
    GlobalSales = sum(Global_Sales),
    count_game = n_distinct(Name),
    .groups = "drop"
  ) %>%
  arrange(desc(Year))


# Plot-size options (repr.* names; presumably read by a notebook front end,
# verify they have an effect in the rendering environment used here).
options(repr.plot.width = 16, repr.plot.height = 8)

# Bubble chart: one bubble per publisher and year. Height is the summed
# global sales, bubble area scales with the number of titles, and fill
# colour identifies the publisher.
ggplot(
  publisher_bubble,
  aes(x = Year, y = GlobalSales, size = count_game, fill = Publisher)
) +
  geom_point(shape = 21, color = "black", alpha = 0.5) +
  scale_size(name = "Number of Games", range = c(.1, 24)) +
  labs(
    title = "Top-5 Publisher Distribution by Yearly Number of Game and Sales",
    x = "Year",
    y = "in millions"
  ) +
  theme_stata() +
  # Applied after theme_stata() so these settings override the base theme.
  theme(
    legend.position = "right",
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )