Data source:
Load the Required Packages
library(tidyverse)
library(ggthemes)
library(ggplot2)
library(reshape2)
Introduction
This dataset contains a list of video games with sales greater than 100,000 copies. It was generated by a scrape of vgchartz.com.
Fields include
Rank - Ranking of overall sales
Name - The game's name
Platform - Platform of the game's release (e.g. PC, PS4, etc.)
Year - Year of the game’s release
Genre - Genre of the game
Publisher - Publisher of the game
NA_Sales - Sales in North America (in millions)
EU_Sales - Sales in Europe (in millions)
JP_Sales - Sales in Japan (in millions)
Other_Sales - Sales in the rest of the world (in millions)
Global_Sales - Total worldwide sales (in millions).
The script to scrape the data is available at https://github.com/GregorUT/vgchartzScrape. It is based on BeautifulSoup using Python. There are 16,598 records. 2 records were dropped due to incomplete information.
Read Data
# Import the video game sales CSV and print a per-column overview of it.
game <- read.csv(file = "vgsales.csv")
summary(object = game)
## Rank Name Platform Year
## Min. : 1 Length:16598 Length:16598 Length:16598
## 1st Qu.: 4151 Class :character Class :character Class :character
## Median : 8300 Mode :character Mode :character Mode :character
## Mean : 8301
## 3rd Qu.:12450
## Max. :16600
## Genre Publisher NA_Sales EU_Sales
## Length:16598 Length:16598 Min. : 0.0000 Min. : 0.0000
## Class :character Class :character 1st Qu.: 0.0000 1st Qu.: 0.0000
## Mode :character Mode :character Median : 0.0800 Median : 0.0200
## Mean : 0.2647 Mean : 0.1467
## 3rd Qu.: 0.2400 3rd Qu.: 0.1100
## Max. :41.4900 Max. :29.0200
## JP_Sales Other_Sales Global_Sales
## Min. : 0.00000 Min. : 0.00000 Min. : 0.0100
## 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.0600
## Median : 0.00000 Median : 0.01000 Median : 0.1700
## Mean : 0.07778 Mean : 0.04806 Mean : 0.5374
## 3rd Qu.: 0.04000 3rd Qu.: 0.04000 3rd Qu.: 0.4700
## Max. :10.22000 Max. :10.57000 Max. :82.7400
Data Pre-Processing
Check data
# Preview the first six rows (six is also the default for head()).
head(game, n = 6L)
## Rank Name Platform Year Genre Publisher NA_Sales
## 1 1 Wii Sports Wii 2006 Sports Nintendo 41.49
## 2 2 Super Mario Bros. NES 1985 Platform Nintendo 29.08
## 3 3 Mario Kart Wii Wii 2008 Racing Nintendo 15.85
## 4 4 Wii Sports Resort Wii 2009 Sports Nintendo 15.75
## 5 5 Pokemon Red/Pokemon Blue GB 1996 Role-Playing Nintendo 11.27
## 6 6 Tetris GB 1989 Puzzle Nintendo 23.20
## EU_Sales JP_Sales Other_Sales Global_Sales
## 1 29.02 3.77 8.46 82.74
## 2 3.58 6.81 0.77 40.24
## 3 12.88 3.79 3.31 35.82
## 4 11.01 3.28 2.96 33.00
## 5 8.89 10.22 1.00 31.37
## 6 2.26 4.22 0.58 30.26
# Column-wise view: one line per column showing its type and first values.
dplyr::glimpse(game)
## Rows: 16,598
## Columns: 11
## $ Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17~
## $ Name <chr> "Wii Sports", "Super Mario Bros.", "Mario Kart Wii", "Wii~
## $ Platform <chr> "Wii", "NES", "Wii", "Wii", "GB", "GB", "DS", "Wii", "Wii~
## $ Year <chr> "2006", "1985", "2008", "2009", "1996", "1989", "2006", "~
## $ Genre <chr> "Sports", "Platform", "Racing", "Sports", "Role-Playing",~
## $ Publisher <chr> "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo~
## $ NA_Sales <dbl> 41.49, 29.08, 15.85, 15.75, 11.27, 23.20, 11.38, 14.03, 1~
## $ EU_Sales <dbl> 29.02, 3.58, 12.88, 11.01, 8.89, 2.26, 9.23, 9.20, 7.06, ~
## $ JP_Sales <dbl> 3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.50, 2.93, 4.70, 0.~
## $ Other_Sales <dbl> 8.46, 0.77, 3.31, 2.96, 1.00, 0.58, 2.90, 2.85, 2.26, 0.4~
## $ Global_Sales <dbl> 82.74, 40.24, 35.82, 33.00, 31.37, 30.26, 30.01, 29.02, 2~
# Per-column summary statistics of the raw data frame.
game %>% summary()
## Rank Name Platform Year
## Min. : 1 Length:16598 Length:16598 Length:16598
## 1st Qu.: 4151 Class :character Class :character Class :character
## Median : 8300 Mode :character Mode :character Mode :character
## Mean : 8301
## 3rd Qu.:12450
## Max. :16600
## Genre Publisher NA_Sales EU_Sales
## Length:16598 Length:16598 Min. : 0.0000 Min. : 0.0000
## Class :character Class :character 1st Qu.: 0.0000 1st Qu.: 0.0000
## Mode :character Mode :character Median : 0.0800 Median : 0.0200
## Mean : 0.2647 Mean : 0.1467
## 3rd Qu.: 0.2400 3rd Qu.: 0.1100
## Max. :41.4900 Max. :29.0200
## JP_Sales Other_Sales Global_Sales
## Min. : 0.00000 Min. : 0.00000 Min. : 0.0100
## 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.0600
## Median : 0.00000 Median : 0.01000 Median : 0.1700
## Mean : 0.07778 Mean : 0.04806 Mean : 0.5374
## 3rd Qu.: 0.04000 3rd Qu.: 0.04000 3rd Qu.: 0.4700
## Max. :10.22000 Max. :10.57000 Max. :82.7400
Check missing values
# Count NA entries per column. is.na() only flags real NA values, so text
# placeholders such as "N/A" are not counted here.
game %>%
  is.na() %>%
  colSums()
## Rank Name Platform Year Genre Publisher
## 0 0 0 0 0 0
## NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
## 0 0 0 0 0
# List the distinct Year values to spot entries that are not real years.
unique(game[["Year"]])
## [1] "2006" "1985" "2008" "2009" "1996" "1989" "1984" "2005" "1999" "2007"
## [11] "2010" "2013" "2004" "1990" "1988" "2002" "2001" "2011" "1998" "2015"
## [21] "2012" "2014" "1992" "1997" "1993" "1994" "1982" "2003" "1986" "2000"
## [31] "N/A" "1995" "2016" "1991" "1981" "1987" "1980" "1983" "2020" "2017"
# Keep only the rows whose Year is not the "N/A" placeholder string.
game_clean <- game[!(game$Year == "N/A"), , drop = FALSE]
Wrangling data frame
# Convert Year from character to factor, then confirm the resulting class.
game_clean$Year <- as.factor(game_clean$Year)
class(game_clean[["Year"]])
## [1] "factor"
EDA (Exploratory Data Analysis)
Gaming Platform
# Tally games per platform, most frequent first. Converting the sorted table
# to a data frame yields the columns Var1 (platform) and Freq (count).
plataformFreq <- table(game$Platform)
plataformFreq <- sort(plataformFreq, decreasing = TRUE)
plataformFreq <- as.data.frame(plataformFreq)

# One bar, and one fill colour, per platform.
ggplot(
  data = plataformFreq,
  mapping = aes(x = Var1, y = Freq, fill = Var1)
) +
  geom_col() +
  labs(title = "Barplot of Platform", x = "Platform", y = "Frequency")
PlayStation global sales per year
# Stacked bar chart of global sales per release year for the PS, PS2, PS3 and
# PS4 platforms, coloured by platform.
# Plot from game_clean rather than the raw frame so that rows whose Year is
# the "N/A" placeholder do not show up as a bogus category on the year axis.
ggplot(
  subset(game_clean, Platform %in% c("PS", "PS2", "PS3", "PS4")),
  aes(x = Year, y = Global_Sales, fill = Platform)
) +
  ggtitle("Barplot of playstation global sales per year") +
  geom_bar(stat = "identity") +
  labs(x = "Year", y = "Global Sales")
Most popular game genres
# Tally games per genre, most frequent first. Converting the sorted table to
# a data frame yields the columns Var1 (genre) and Freq (count).
genreFreq <- table(game$Genre)
genreFreq <- sort(genreFreq, decreasing = TRUE)
genreFreq <- as.data.frame(genreFreq)

# One bar, and one fill colour, per genre.
ggplot(
  data = genreFreq,
  mapping = aes(x = Var1, y = Freq, fill = Var1)
) +
  geom_col() +
  labs(title = "Barplot of Genre", x = "Genre", y = "Frequency")
Top-5 Publisher Distribution by Year
# Rank publishers by the number of distinct game titles they released and
# keep the five with the most. The result is a one-column tibble (Publisher).
# The per-publisher sales total is not computed here: it was never used for
# the ranking and was dropped before the result was stored.
publisher_count <- game %>%
  group_by(Publisher) %>%
  summarise(count_game = n_distinct(Name), .groups = "drop") %>%
  arrange(desc(count_game)) %>%
  slice_head(n = 5) %>%
  select(Publisher)

# Plain character vector of those publisher names, used for filtering below.
# NOTE(review): despite the "20" suffix this holds five names; the variable
# name is kept so existing references keep working.
publisher_count20 <- publisher_count[["Publisher"]]
# Per year and publisher (top publishers only): total global sales and the
# number of distinct titles, ordered from the latest year to the earliest.
# Built from game_clean rather than the raw frame so that rows whose Year is
# the "N/A" placeholder do not form their own "year" group in the chart.
publisher_bubble <- game_clean %>%
  filter(Publisher %in% publisher_count20) %>%
  group_by(Year, Publisher) %>%
  summarise(
    GlobalSales = sum(Global_Sales),
    count_game = n_distinct(Name),
    .groups = "drop"
  ) %>%
  arrange(desc(Year))
# Request a wide figure via the repr.plot.* options.
# NOTE(review): presumably aimed at a Jupyter/IRkernel front-end; confirm.
options(repr.plot.height = 8, repr.plot.width = 16)

# Bubble chart: one bubble per year and publisher. Height is total global
# sales, bubble area is the number of distinct titles, fill is the publisher.
ggplot(
  data = publisher_bubble,
  mapping = aes(
    x = Year,
    y = GlobalSales,
    size = count_game,
    fill = Publisher
  )
) +
  # Shape 21 is a filled circle with a separate outline colour.
  geom_point(shape = 21, colour = "black", alpha = 0.5) +
  scale_size(name = "Number of Games", range = c(0.1, 24)) +
  labs(
    title = "Top-5 Publisher Distribution by Yearly Number of Game and Sales",
    x = "Year",
    y = "in millions"
  ) +
  theme_stata() +
  # Applied after theme_stata() so these settings override the preset.
  theme(
    legend.position = "right",
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )