We will have a look today at the ~18000 video game reviews by IGN.com. After a quick overview, we will focus on a comparison between the main market players: Xbox, PlayStation, Nintendo and PC (although most PCs are Microsoft-based - like the Xbox - they are different platforms and are treated differently).
The dataset is available on Kaggle.
library(dplyr)
library(tidyr)
library(ggplot2)
# Load the IGN review export (comma-separated file with a header row).
data.source <- read.csv(
  "C:/Users/marc/Desktop/Data/161030_ign/ign.csv",
  header = TRUE,
  sep = ","
)
# Alternative input path, kept commented out:
#data.source <- read.csv('../input/ign.csv',sep=',')

# The Walking Dead review is currently dated 1-1-1970. Every row whose release
# year is 1970 is re-dated to 2012-04-23, the date found on
# "http://www.ign.com/articles/2012/04/23/the-walking-dead-the-game-episode-1-review-3".
data.cleaned <- data.source
correction <- data.cleaned$release_year == "1970"
data.cleaned$release_day[correction] <- 23
data.cleaned$release_month[correction] <- 4
# NOTE: the year is assigned as a string, exactly as before; for a numeric
# column this coerces release_year to character.
data.cleaned$release_year[correction] <- "2012"

# Assemble "year/month/day" text from the three release columns and parse it
# into a Date stored in the new `date` column.
data.cleaned$date <- as.Date(
  with(
    data.cleaned,
    paste(release_year, release_month, release_day, sep = "/")
  ),
  format = "%Y/%m/%e"
)
# Order the score_phrase levels from the worst rating to the best one, so
# that tables and plot axes follow the rating scale instead of the alphabet.
score.levels <- c(
  "Disaster", "Unbearable", "Painful", "Awful", "Bad", "Mediocre",
  "Okay", "Good", "Great", "Amazing", "Masterpiece"
)
data.cleaned$score_phrase <- factor(
  data.cleaned$score_phrase,
  levels = score.levels
)
# Main genres reviewed ----
# Count the reviews per genre, pool every genre with fewer than 300 reviews
# into a single "Other" category, then plot the totals as a horizontal bar
# chart.
overview.genre <- as.data.frame(xtabs(~ genre, data = data.cleaned))
overview.genre$genre <- as.character(overview.genre$genre)
#quantile(overview.genre$Freq, prob = 0.50)
#quantile(overview.genre$Freq)

# Vectorised replacement of the former row-by-row loop: the whole `category`
# column is built in one step, so nothing depends on `1:length()` or on a
# column being created implicitly by the first loop iteration.
overview.genre$category <- ifelse(
  overview.genre$Freq < 300,
  "Other",
  overview.genre$genre
)

# Sum the counts per category (this merges all the "Other" rows) and sort
# from the most to the least reviewed category.
overview.genre <- aggregate(Freq ~ category, data = overview.genre, FUN = sum)
overview.genre <- arrange(overview.genre, desc(Freq))

ggplot(overview.genre, aes(x = reorder(category, Freq), y = Freq, label = Freq)) +
  geom_bar(stat = "identity", fill = "#07617d") +
  coord_flip() +
  xlab("Genre") +
  ylab("Number of reviews") +
  theme_bw() +
  ggtitle("What are the main genre reviewed?") +
  # Negative hjust places the count just past the end of each bar; the axis
  # limit below leaves room for those labels.
  geom_text(hjust = -0.5) +
  scale_y_continuous(limits = c(0, 4500))
The Other category is the sum of all genre categories with fewer than 300 reviews (during the period 1996-2016)
First surprise: the sports and racing genres rank pretty high, above adventure or RPG
The Action genre represents 20% of all reviews
# Editor's pick ----
# Jitter plot of every review: score category on x, editor's-choice flag on y.
# The y aesthetic uses the bare column name so ggplot2 resolves it inside
# `data.cleaned`; writing `data.cleaned$editors_choice` in aes() bypasses the
# plot data and is discouraged by ggplot2.
ggplot(data.cleaned, aes(x = score_phrase, y = editors_choice)) +
  geom_jitter(alpha = 0.2, colour = "#07617d") +
  theme_bw() +
  ggtitle("Review by score category and Editor's choice") +
  xlab("Score categories") +
  ylab("Editor's choice")
Most games are labelled as okay, good or great
A game had better be rated at least Great if it wants to be part of the Editor's choice
# Score bar chart ----
# Number of reviews in each score category, drawn as horizontal bars to
# confirm what the jitter plot suggests.
overview.score <- as.data.frame(xtabs(~ score_phrase, data = data.cleaned))
ggplot(overview.score, aes(x = score_phrase, y = Freq, label = Freq)) +
  # Lower-case "identity", consistent with every other bar chart in this file.
  geom_bar(stat = "identity", fill = "#07617d") +
  coord_flip() +
  theme_bw() +
  ggtitle("All reviews by score category") +
  ylab("Number of reviews") +
  xlab("") +
  # Counts are printed just past the bar ends; the limit leaves room for them.
  geom_text(hjust = -0.5) +
  scale_y_continuous(limits = c(0, 5200))
# Median review score for each release year, shown as a connected point chart.
score.year <- aggregate(
  score ~ release_year,
  data = data.cleaned,
  FUN = median
)
# group = 1 joins all years into a single line.
ggplot(score.year, aes(x = release_year, y = score, group = 1)) +
  geom_point(colour = "#07617d", size = 5) +
  geom_line(colour = "#07617d", size = 1.2) +
  scale_y_continuous(limits = c(5, 10)) +
  labs(title = "Median Score by year", x = "Release year", y = "Score") +
  theme_bw()
Scores are between 0 (Disaster) and 10 (Masterpiece)
1996 and 2008 are the worst years in terms of median score
Median score increased by around 1 point in 20 years
# Main platforms reviewed ----
# Review count per platform, sorted from most to least reviewed; only the ten
# largest platforms are plotted.
overview.plateform <- arrange(
  as.data.frame(xtabs(~ platform, data = data.cleaned)),
  desc(Freq)
)
ggplot(
  head(overview.plateform, n = 10),
  aes(x = reorder(platform, Freq), y = Freq, label = Freq)
) +
  geom_bar(fill = "#07617d", stat = "identity") +
  geom_text(hjust = -0.5) +
  coord_flip() +
  scale_y_continuous(limits = c(0, 3700)) +
  labs(title = "Top 10 platforms", x = "", y = "Number of reviews") +
  theme_bw()
# Number of reviews per release year ----
overview.year <- as.data.frame(xtabs(~ release_year, data = data.cleaned))
# group = 1 connects the yearly counts with a single line.
ggplot(overview.year, aes(x = release_year, y = Freq, group = 1)) +
  geom_point(colour = "#07617d", size = 5) +
  geom_line(colour = "#07617d", size = 1.2) +
  labs(title = "Number of reviews per release year") +
  theme_bw()
After a peak in 2008, the number of reviews strongly decreased. I am not sure whether this is the result of IGN decreasing the number of reviews they publish, or whether the number of video game titles released per year strongly decreased. I found the following chart on Quora which seems to indicate that the number of titles decreased; however, the author is not completely sure of the data accuracy.
Nevertheless, with all the mobile games and indie games available on platforms like Steam, it is hard to believe that the number of games decreased.
# Share of each score category per release year ----
# Cross-tabulate release year against score category, then draw one bar per
# year filled to 100% so the mix of categories can be compared across years.
overview.year.score <- as.data.frame(
  xtabs(~ release_year + score_phrase, data = data.cleaned)
)
# Eleven fill colours supplied to the score_phrase fill scale.
gradient.palette <- c(
  "#402316", "#b40100", "#ee4000", "#ff7d00", "#ffb430", "#f6a454",
  "#ffec5f", "#b0ff00", "#72e800", "#5aaf4a", "#4a631e"
)
ggplot(
  overview.year.score,
  aes(x = release_year, y = Freq, fill = score_phrase)
) +
  geom_bar(position = "fill", stat = "identity") +
  scale_fill_manual(values = gradient.palette) +
  labs(title = "Review per year and score category", x = "Release year") +
  theme_bw()
You can find below the platforms I kept for the selection. I had a doubt regarding “Web Games”, which I finally did not include in the PC category (as I guess they can be related to mobile phones?)
Microsoft: Xbox One, Xbox, Xbox 360
Playstation: PlayStation Portable, PlayStation 4, PlayStation 2, PlayStation 3, PlayStation Vita, PlayStation
Nintendo: Nintendo 3DS, Wii U, Nintendo 64DD, Game Boy Advance,Nintendo DSi, New Nintendo 3DS,Nintendo DS,Wii,Nintendo 64,Game Boy,NES,Game Boy Color, GameCube,Super NES
PC: Linux, Macintosh, SteamOS, PC
I labelled those players as “companies”, but this is not completely accurate, as PC is treated more as a separate platform in this case…
# Platform groups ----
# Each vector lists the platform names assigned to one "company".
Microsoft <- c("Xbox One", "Xbox", "Xbox 360")
Sony <- c(
  "PlayStation Portable",
  "PlayStation 4",
  "PlayStation 2",
  "PlayStation 3",
  "PlayStation Vita",
  "PlayStation"
)
PC <- c("Linux", "Macintosh", "SteamOS", "PC")
Nintendo <- c(
  "Nintendo 3DS",
  "Wii U",
  "Nintendo 64DD",
  "Game Boy Advance",
  "Nintendo DSi",
  "New Nintendo 3DS",
  "Nintendo DS",
  "Wii",
  "Nintendo 64",
  "Game Boy",
  "NES",
  "Game Boy Color",
  "GameCube",
  "Super NES"
)

# Label every review with the company owning its platform; platforms found in
# none of the groups above fall into "Other". This is a vectorised replacement
# of the former `for (i in 1:length(...))` loop: it gives the same labels in a
# single pass, does not assign into the data frame row by row, and also works
# when the data frame has no rows. The groups are tested in the same order as
# before (Microsoft, Sony, PC, Nintendo).
data.cleaned$company <- as.factor(
  ifelse(
    data.cleaned$platform %in% Microsoft, "Microsoft",
    ifelse(
      data.cleaned$platform %in% Sony, "Sony",
      ifelse(
        data.cleaned$platform %in% PC, "PC",
        ifelse(data.cleaned$platform %in% Nintendo, "Nintendo", "Other")
      )
    )
  )
)
# One fixed colour per company, matched by name in the manual colour scales.
my.palette <- c(
  "Sony"      = "#edc951",
  "Nintendo"  = "#eb6841",
  "PC"        = "#cc2a36",
  "Microsoft" = "#4f372d",
  "Other"     = "#00a0b0"
)

# Number of reviews per company ----
overview.company <- as.data.frame(xtabs(~ company, data = data.cleaned))
ggplot(
  overview.company,
  aes(x = reorder(company, Freq), y = Freq, fill = company, label = Freq)
) +
  geom_bar(stat = "identity") +
  geom_text(hjust = -0.5) +
  coord_flip() +
  scale_fill_manual(values = my.palette) +
  scale_y_continuous(limits = c(0, 5500)) +
  labs(title = "Number of reviews", x = "Company", y = "Number of reviews") +
  theme_bw() +
  # The bars are already labelled on the axis, so the fill legend is hidden.
  theme(legend.position = "none")
PC was previously seen as the main platform for reviews; however, once the consoles from the same manufacturer are summed up, we can see that Sony (with its PlayStations) and Nintendo are above it
Microsoft arrived later on the market with the Xbox, which is probably the reason why its number of reviews is quite low
The Other platforms are clearly below the main actors
# Score category by company ----
# One jittered, semi-transparent point per review, coloured by company.
ggplot(
  data.cleaned,
  aes(x = score_phrase, y = company, colour = company)
) +
  geom_jitter(alpha = 0.2) +
  scale_color_manual(values = my.palette) +
  labs(title = "Reviews by Score categories", x = "Score category", y = "") +
  theme_bw() +
  theme(legend.position = "none")
# Score distribution per company ----
# One box plot per company, outlined in that company's palette colour.
ggplot(data.cleaned, aes(x = company, y = score, colour = company)) +
  geom_boxplot() +
  scale_colour_manual(values = my.palette) +
  labs(title = "Overall score", x = "") +
  theme_bw() +
  theme(legend.position = "none")
# Score evolution ----
# Median score per company and release year, one facet row per company.
company.score <- aggregate(
  score ~ company + release_year,
  data = data.cleaned,
  FUN = median
)
ggplot(
  company.score,
  aes(x = release_year, y = score, colour = company, group = company)
) +
  geom_point(size = 5) +
  geom_line(size = 1.2) +
  # Smaller white points drawn on top of the coloured ones; they inherit the
  # x and y mapping from the plot.
  geom_point(colour = "white", size = 3) +
  scale_color_manual(values = my.palette) +
  facet_grid(company ~ .) +
  labs(title = "Median score by year") +
  theme_bw() +
  theme(legend.position = "none")
Nintendo’s median score has been increasing since 2008, as have those of PC and Sony
Other is fluctuating quite a lot; however, this is probably related to the variety of platforms over time (Ouya, mobile phone games, etc…)
The median score of Other in 2001 is really high, at almost 8.5. We will see below where it comes from.
# Reviews from the "Other" company group released in 2001, coloured by
# platform.
other.2001 <- filter(
  data.cleaned,
  company == "Other",
  release_year == 2001
)
ggplot(other.2001, aes(x = score_phrase, y = company, colour = platform)) +
  geom_jitter(size = 3) +
  # Three colours are supplied for the platform colour scale.
  scale_colour_manual(values = c("#f2bc69", "#6cff85", "#ec70f4")) +
  labs(title = "Other category in 2001", x = "", y = "") +
  theme_bw()
# Number of masterpieces per company ----
# Only the reviews whose score category is "Masterpiece" are counted.
masterpiece.overview <- as.data.frame(
  xtabs(
    ~ company,
    data = data.cleaned[data.cleaned$score_phrase == "Masterpiece", ]
  )
)
ggplot(
  masterpiece.overview,
  aes(x = reorder(company, Freq), y = Freq, fill = company, label = Freq)
) +
  geom_bar(stat = "identity") +
  geom_text(hjust = -0.5) +
  coord_flip() +
  scale_fill_manual(values = my.palette) +
  scale_y_continuous(limits = c(0, 20)) +
  labs(
    title = "Number of masterpieces per company",
    x = "",
    y = "Number of reviews"
  ) +
  theme_bw() +
  theme(legend.position = "none")
# When were the masterpieces released? ----
# "Masterpiece" reviews counted per company and release year, stacked by
# company within each year.
masterpiece.year <- as.data.frame(
  xtabs(
    ~ company + release_year,
    data = data.cleaned[data.cleaned$score_phrase == "Masterpiece", ]
  )
)
ggplot(masterpiece.year, aes(x = release_year, y = Freq, fill = company)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual(values = my.palette) +
  labs(
    title = "Games labelled as masterpieces over the years",
    x = "Release year",
    y = "Number of masterpiece reviews"
  ) +
  theme_bw()
It is not so surprising to see that the masterpieces of the last few years are split between PC, Microsoft and Sony, as most games are sold cross-platform (except for Nintendo, I guess)
2016 has so far not had many reviews (as we saw before); however, the number of masterpieces has (almost) never been so high. This is probably related to the previous bullet point, as 1 masterpiece game sold on 3 platforms counts as 3 masterpieces
Are the golden years of Nintendo behind us?
# Masterpieces per genre and company ----
# Cross-tabulate company against genre for "Masterpiece" reviews only.
# `drop.unused.levels` is an argument of xtabs(); it was previously passed to
# as.data.frame(), where it was silently ignored.
masterpiece.genre <- as.data.frame(xtabs(
  ~ company + genre,
  data = data.cleaned[data.cleaned$score_phrase == "Masterpiece", ],
  drop.unused.levels = TRUE
))
# The table still holds a zero cell for every company/genre pair that never
# occurs together, so those rows are removed before plotting.
masterpiece.genre <- filter(masterpiece.genre, Freq != 0)
ggplot(masterpiece.genre, aes(x = genre, y = Freq, fill = company)) +
  geom_bar(stat = "identity", position = "stack") +
  theme_bw() +
  coord_flip() +
  scale_fill_manual(values = my.palette) +
  ggtitle("Games labelled as masterpieces per genre") +
  xlab("Genre") +
  ylab("Number of masterpiece reviews") +
  # One panel per company; the facet strips name the company, so the legend
  # is hidden.
  facet_grid(~ company) +
  theme(legend.position = "none")
First surprise, Sony has no masterpieces in the RPG genre, despite many games like Final Fantasy, Persona, etc… on Playstation.
No shooter games!
Action/adventure is the main “masterpiece” genre