# Load the video game sales data from CSV into a data frame.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
Video_Games <- read.csv("/Users/apple/Desktop/R/Video_Games.csv")
# The former data(Video_Games) call was removed: data() looks up data sets
# shipped with packages, so it found nothing here and only raised the warning
# "data set 'Video_Games' not found". read.csv() has already done the loading.
#Introduction_and_Preparation
# Per-column overview of the freshly loaded data.
summary(Video_Games)
## Name Platform Year_of_Release Genre
## Length:16719 Length:16719 Length:16719 Length:16719
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Publisher NA_Sales EU_Sales JP_Sales
## Length:16719 Min. : 0.0000 Min. : 0.000 Min. : 0.0000
## Class :character 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000
## Mode :character Median : 0.0800 Median : 0.020 Median : 0.0000
## Mean : 0.2633 Mean : 0.145 Mean : 0.0776
## 3rd Qu.: 0.2400 3rd Qu.: 0.110 3rd Qu.: 0.0400
## Max. :41.3600 Max. :28.960 Max. :10.2200
##
## Other_Sales Global_Sales Critic_Score Critic_Count
## Min. : 0.00000 Min. : 0.0100 Min. :13.00 Min. : 3.00
## 1st Qu.: 0.00000 1st Qu.: 0.0600 1st Qu.:60.00 1st Qu.: 12.00
## Median : 0.01000 Median : 0.1700 Median :71.00 Median : 21.00
## Mean : 0.04733 Mean : 0.5335 Mean :68.97 Mean : 26.36
## 3rd Qu.: 0.03000 3rd Qu.: 0.4700 3rd Qu.:79.00 3rd Qu.: 36.00
## Max. :10.57000 Max. :82.5300 Max. :98.00 Max. :113.00
## NA's :8582 NA's :8582
## User_Score User_Count Developer Rating
## Length:16719 Min. : 4.0 Length:16719 Length:16719
## Class :character 1st Qu.: 10.0 Class :character Class :character
## Mode :character Median : 24.0 Mode :character Mode :character
## Mean : 162.2
## 3rd Qu.: 81.0
## Max. :10665.0
## NA's :9129
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Coerce the release year and the user score to numeric. Entries that are not
# numbers become NA, which is what triggers the coercion warnings below.
Video_Games[["Year_of_Release"]] <- as.numeric(
  as.character(Video_Games[["Year_of_Release"]])
)
## Warning: NAs introduced by coercion
Video_Games[["User_Score"]] <- as.numeric(
  as.character(Video_Games[["User_Score"]])
)
## Warning: NAs introduced by coercion
# Re-inspect the data now that both columns are numeric.
summary(Video_Games)
## Name Platform Year_of_Release Genre
## Length:16719 Length:16719 Min. :1980 Length:16719
## Class :character Class :character 1st Qu.:2003 Class :character
## Mode :character Mode :character Median :2007 Mode :character
## Mean :2006
## 3rd Qu.:2010
## Max. :2020
## NA's :269
## Publisher NA_Sales EU_Sales JP_Sales
## Length:16719 Min. : 0.0000 Min. : 0.000 Min. : 0.0000
## Class :character 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000
## Mode :character Median : 0.0800 Median : 0.020 Median : 0.0000
## Mean : 0.2633 Mean : 0.145 Mean : 0.0776
## 3rd Qu.: 0.2400 3rd Qu.: 0.110 3rd Qu.: 0.0400
## Max. :41.3600 Max. :28.960 Max. :10.2200
##
## Other_Sales Global_Sales Critic_Score Critic_Count
## Min. : 0.00000 Min. : 0.0100 Min. :13.00 Min. : 3.00
## 1st Qu.: 0.00000 1st Qu.: 0.0600 1st Qu.:60.00 1st Qu.: 12.00
## Median : 0.01000 Median : 0.1700 Median :71.00 Median : 21.00
## Mean : 0.04733 Mean : 0.5335 Mean :68.97 Mean : 26.36
## 3rd Qu.: 0.03000 3rd Qu.: 0.4700 3rd Qu.:79.00 3rd Qu.: 36.00
## Max. :10.57000 Max. :82.5300 Max. :98.00 Max. :113.00
## NA's :8582 NA's :8582
## User_Score User_Count Developer Rating
## Min. :0.000 Min. : 4.0 Length:16719 Length:16719
## 1st Qu.:6.400 1st Qu.: 10.0 Class :character Class :character
## Median :7.500 Median : 24.0 Mode :character Mode :character
## Mean :7.125 Mean : 162.2
## 3rd Qu.:8.200 3rd Qu.: 81.0
## Max. :9.700 Max. :10665.0
## NA's :9129 NA's :9129
#Adding_new_column
# Bucket each release year into its decade and store the label in Year as a
# factor whose levels run chronologically ("80s" < "90s" < "00s" < "10s").
# Every year from 2010 onwards lands in "10s"; missing years and years before
# 1980 stay NA.
Video_Games$Year <- cut(
  Video_Games$Year_of_Release,
  breaks = c(1980, 1990, 2000, 2010, Inf),
  labels = c("80s", "90s", "00s", "10s"),
  right = FALSE,          # intervals are [lower, upper)
  include.lowest = TRUE   # closes the last interval at its upper end
)
#Global_Sales_Histogram
# Total global sales per release year.
# Without a weight, geom_histogram() counts rows (titles released per year),
# which contradicts the title and the y-axis label. Weighting every row by
# Global_Sales makes each one-year bin show the summed sales instead.
# na.rm = TRUE drops rows with a missing year without a warning.
ggplot(Video_Games, aes(x = Year_of_Release, weight = Global_Sales)) +
  geom_histogram(fill = "gray", color = "black", binwidth = 1, na.rm = TRUE) +
  labs(x = "Year of Release", y = "Total Amount of Copies Sold",
       title = "Global Sales by Year") +
  theme(plot.title = element_text(hjust = 0.5))
#Top_10
# Pick the ten best sellers by rank rather than by a hard-coded sales cut-off
# (the previous `Global_Sales > 28.30` silently goes stale when the data
# change). min_rank() leaves the row order untouched and keeps titles tied at
# the boundary.
Top_10 <- Video_Games %>%
  filter(min_rank(desc(Global_Sales)) <= 10)
# Name / sales pairs feeding the bar chart.
dat <- data.frame(x = Top_10$Name,
                  y = Top_10$Global_Sales)
# las = 2 draws the game names perpendicular to the axis; cex.names shrinks
# them so they fit.
barplot(dat$y, names.arg = dat$x, cex.names = 0.5, las = 2,
        main = "The Top 10 Best-Selling Video Games of All Time")
#Top_20
# Pick the twenty best sellers by rank rather than by a hard-coded sales
# cut-off (the previous `Global_Sales > 20.14` silently goes stale when the
# data change). min_rank() leaves the row order untouched and keeps titles
# tied at the boundary.
Top_20 <- Video_Games %>%
  filter(min_rank(desc(Global_Sales)) <= 20)
# Name / sales pairs feeding the bar chart.
dat <- data.frame(x = Top_20$Name,
                  y = Top_20$Global_Sales)
# las = 2 draws the game names perpendicular to the axis; cex.names shrinks
# them so they fit.
barplot(dat$y, names.arg = dat$x, cex.names = 0.5, las = 2,
        main = "The Top 20 Best-Selling Video Games of All Time")
#Top_100
# Pick the hundred best sellers by rank rather than by a hard-coded sales
# cut-off (the previous `Global_Sales > 7.38` only matches "top 100" for one
# particular snapshot of the data). Row order is preserved; ties at the
# boundary are kept.
Top_100 <- Video_Games %>%
  filter(min_rank(desc(Global_Sales)) <= 100)
# Distribution of global sales per platform among those titles. The legend is
# hidden because the fill colour only repeats the x axis.
ggplot(Top_100, aes(x = Platform, y = Global_Sales, fill = Platform)) +
  geom_boxplot(alpha = 0.6) +
  labs(x = "Platform", y = "Total Amount of Copies Sold",
       title = "The Top 100 Best-Selling Video Games by Platform") +
  theme(legend.position = "none",
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(hjust = 0.5, face = "bold"))
#Excluding_number1
# The top-100 titles without the single best seller, selected as ranks 2..100.
# This replaces the hard-coded bounds `> 7.38` and `< 82.53` (82.53 being the
# maximum of Global_Sales in the loaded data), which break as soon as the data
# change. As before, every title sharing the maximum is excluded: tied leaders
# all receive rank 1.
Excluding_number1 <- Video_Games %>%
  filter(between(min_rank(desc(Global_Sales)), 2, 100))
# Same per-platform boxplot as for the full top 100, now without the outlier
# stretching the y axis.
ggplot(Excluding_number1, aes(x = Platform, y = Global_Sales,
                              fill = Platform)) +
  geom_boxplot(alpha = 0.6) +
  labs(x = "Platform", y = "Total Amount of Copies Sold",
       title = "Excluding Number 1") +
  theme(legend.position = "none",
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(hjust = 0.5, face = "bold"))
#Main_Platforms
# Narrow the data down to the six platforms examined in the remaining plots.
Main_Platforms <- filter(
  Excluding_number1,
  Platform %in% c("DS", "GB", "NES", "PS2", "Wii", "X360")
)
# Boxplot of global sales for each of those platforms; the legend is hidden
# and the panel background and grid lines are blanked out.
Main_Platforms %>%
  ggplot(aes(x = Platform, y = Global_Sales, fill = Platform)) +
  geom_boxplot(alpha = 0.6) +
  xlab("Platform") +
  ylab("Total Amount of Copies Sold") +
  ggtitle("Global Sales by Platform") +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
#Rating
# Boxplot of global sales grouped by the Rating column, restricted to the main
# platforms. Styling matches the other boxplots: no legend, blank panel.
Main_Platforms %>%
  ggplot(aes(x = Rating, y = Global_Sales, fill = Rating)) +
  geom_boxplot(alpha = 0.6) +
  xlab("Rating") +
  ylab("Total Amount of Copies Sold") +
  ggtitle("Global Sales by Rating") +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
#Genre
# Boxplot of global sales grouped by the Genre column, restricted to the main
# platforms. Styling matches the other boxplots: no legend, blank panel.
Main_Platforms %>%
  ggplot(aes(x = Genre, y = Global_Sales, fill = Genre)) +
  geom_boxplot(alpha = 0.6) +
  xlab("Genre") +
  ylab("Total Amount of Copies Sold") +
  ggtitle("Global Sales by Genre") +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
#pairs_and_correlation_by_Region
# Release year plus the regional and global sales columns. They are selected
# by name instead of by position (formerly c(3, 6:10)) so the analysis keeps
# working if columns are added or reordered.
region_cols <- c("Year_of_Release", "NA_Sales", "EU_Sales", "JP_Sales",
                 "Other_Sales", "Global_Sales")
# Scatter-plot matrix of every pair of those columns.
pairs(Main_Platforms[, region_cols])
# Correlation matrix; rows with any missing value are dropped first.
cor(Main_Platforms[, region_cols], use = "complete.obs")
## Year_of_Release NA_Sales EU_Sales JP_Sales Other_Sales
## Year_of_Release 1.0000000 -0.50149180 0.1211181 -0.35589108 0.11268879
## NA_Sales -0.5014918 1.00000000 0.2540051 0.21467092 0.07540978
## EU_Sales 0.1211181 0.25400505 1.0000000 0.42476773 0.13167708
## JP_Sales -0.3558911 0.21467092 0.4247677 1.00000000 -0.05647155
## Other_Sales 0.1126888 0.07540978 0.1316771 -0.05647155 1.00000000
## Global_Sales -0.3715746 0.83168126 0.6616454 0.56246632 0.28107735
## Global_Sales
## Year_of_Release -0.3715746
## NA_Sales 0.8316813
## EU_Sales 0.6616454
## JP_Sales 0.5624663
## Other_Sales 0.2810773
## Global_Sales 1.0000000
#pairs_and_correlation_by_Score
# Release year, global sales and the two score columns. They are selected by
# name instead of by position (formerly c(3, 10, 11, 13)) so the analysis keeps
# working if columns are added or reordered.
score_cols <- c("Year_of_Release", "Global_Sales", "Critic_Score",
                "User_Score")
# Scatter-plot matrix of every pair of those columns.
pairs(Main_Platforms[, score_cols])
# Correlation matrix; rows with any missing value are dropped first.
cor(Main_Platforms[, score_cols], use = "complete.obs")
## Year_of_Release Global_Sales Critic_Score User_Score
## Year_of_Release 1.0000000 -0.10614012 -0.2156673 -0.51529857
## Global_Sales -0.1061401 1.00000000 -0.1731977 0.07777366
## Critic_Score -0.2156673 -0.17319774 1.0000000 0.44337811
## User_Score -0.5152986 0.07777366 0.4433781 1.00000000
# Scatter plot of one sales column against the release decade, drawn as one
# panel per platform. The five regional plots below used to be copy-pasted
# variants differing only in the y column and the title.
#   data      - data frame holding Year, Platform and the sales column
#   sales_col - name (character string) of the column mapped to the y axis
#   title     - plot title
# Returns the ggplot object; calling it at top level displays the plot.
plot_sales_by_decade <- function(data, sales_col, title) {
  ggplot(data, aes(x = Year, y = .data[[sales_col]])) +
    labs(x = "Year of Release", y = "Total Amount of Copies Sold",
         title = title) +
    geom_point() +
    facet_wrap(~Platform) +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
}
#Global_Sales_by_Platform
plot_sales_by_decade(Main_Platforms, "Global_Sales",
                     "Global Sales of All Time by Platform")
#North_America_Sales_by_Platform
plot_sales_by_decade(Main_Platforms, "NA_Sales",
                     "North America Sales of All Time by Platform")
#Europe_Sales_by_Platform
plot_sales_by_decade(Main_Platforms, "EU_Sales",
                     "Europe Sales of All Time by Platform")
#Japan_Sales_by_Platform
plot_sales_by_decade(Main_Platforms, "JP_Sales",
                     "Japan Sales of All Time by Platform")
#Other_Regions_Sales_by_Platform
plot_sales_by_decade(Main_Platforms, "Other_Sales",
                     "Other Regions Sales of All Time by Platform")
#Coefficient_of_variation_Global
# Standard deviation and mean of global sales across the main platforms,
# skipping missing values.
stdev <- with(Main_Platforms, sd(Global_Sales, na.rm = TRUE))
stdev
## [1] 8.230985
avg <- with(Main_Platforms, mean(Global_Sales, na.rm = TRUE))
avg
## [1] 16.36579
# Coefficient of variation: the standard deviation as a percentage of the mean.
Coefvar <- stdev / avg * 100
Coefvar
## [1] 50.29384
#Coefficient_of_variation_North_America
# Standard deviation and mean of North America sales across the main
# platforms, skipping missing values.
stdev_NA <- with(Main_Platforms, sd(NA_Sales, na.rm = TRUE))
stdev_NA
## [1] 5.499128
avg_NA <- with(Main_Platforms, mean(NA_Sales, na.rm = TRUE))
avg_NA
## [1] 8.288772
# Coefficient of variation: the standard deviation as a percentage of the mean.
Coefvar_NA <- stdev_NA / avg_NA * 100
Coefvar_NA
## [1] 66.3443
# Global (black) against North America (blue) sales by release year. The data
# and the x mapping are declared once and inherited by all four layers.
ggplot(Main_Platforms, aes(x = Year_of_Release)) +
  geom_line(aes(y = Global_Sales), color = "Black") +
  geom_point(aes(y = Global_Sales), color = "Black") +
  geom_line(aes(y = NA_Sales), color = "Blue") +
  geom_point(aes(y = NA_Sales), color = "Blue") +
  labs(x = "", y = "Total Amount of Copies Sold",
       title = "Global vs North America Sales") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
#Coefficient_of_variation_Europe
# Standard deviation and mean of Europe sales across the main platforms,
# skipping missing values.
stdev_EU <- with(Main_Platforms, sd(EU_Sales, na.rm = TRUE))
stdev_EU
## [1] 2.83954
avg_EU <- with(Main_Platforms, mean(EU_Sales, na.rm = TRUE))
avg_EU
## [1] 4.283158
# Coefficient of variation: the standard deviation as a percentage of the mean.
Coefvar_EU <- stdev_EU / avg_EU * 100
Coefvar_EU
## [1] 66.29549
# Global (black) against Europe (green) sales by release year. The data and
# the x mapping are declared once and inherited by all four layers.
ggplot(Main_Platforms, aes(x = Year_of_Release)) +
  geom_line(aes(y = Global_Sales), color = "Black") +
  geom_point(aes(y = Global_Sales), color = "Black") +
  geom_line(aes(y = EU_Sales), color = "Green") +
  geom_point(aes(y = EU_Sales), color = "Green") +
  labs(x = "", y = "Total Amount of Copies Sold",
       title = "Global vs Europe Sales") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
#Coefficient_of_variation_Japan
# Standard deviation and mean of Japan sales across the main platforms,
# skipping missing values.
stdev_JP <- with(Main_Platforms, sd(JP_Sales, na.rm = TRUE))
stdev_JP
## [1] 2.335691
avg_JP <- with(Main_Platforms, mean(JP_Sales, na.rm = TRUE))
avg_JP
## [1] 2.350877
# Coefficient of variation: the standard deviation as a percentage of the mean.
Coefvar_JP <- stdev_JP / avg_JP * 100
Coefvar_JP
## [1] 99.35404
# Global (black) against Japan (red) sales by release year. The data and the
# x mapping are declared once and inherited by all four layers.
ggplot(Main_Platforms, aes(x = Year_of_Release)) +
  geom_line(aes(y = Global_Sales), color = "Black") +
  geom_point(aes(y = Global_Sales), color = "Black") +
  geom_line(aes(y = JP_Sales), color = "Red") +
  geom_point(aes(y = JP_Sales), color = "Red") +
  labs(x = "", y = "Total Amount of Copies Sold",
       title = "Global vs Japan Sales") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
#Coefficient_of_variation_Other_Regions
# Standard deviation and mean of other-region sales across the main
# platforms, skipping missing values.
stdev_Other <- with(Main_Platforms, sd(Other_Sales, na.rm = TRUE))
stdev_Other
## [1] 1.656475
avg_Other <- with(Main_Platforms, mean(Other_Sales, na.rm = TRUE))
avg_Other
## [1] 1.44193
# Coefficient of variation: the standard deviation as a percentage of the mean.
Coefvar_Other <- stdev_Other / avg_Other * 100
Coefvar_Other
## [1] 114.879
# Global (black) against other-region (yellow) sales by release year. The data
# and the x mapping are declared once and inherited by all four layers.
ggplot(Main_Platforms, aes(x = Year_of_Release)) +
  geom_line(aes(y = Global_Sales), color = "Black") +
  geom_point(aes(y = Global_Sales), color = "Black") +
  geom_line(aes(y = Other_Sales), color = "Yellow") +
  geom_point(aes(y = Other_Sales), color = "Yellow") +
  labs(x = "", y = "Total Amount of Copies Sold",
       title = "Global vs Other Regions Sales") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
The “North America” region is the biggest market and has, most of the time, contributed a large share of “global” sales.
While “global” sales have kept increasing over the years, “Japan” and “Other regions” sales have continued to contribute only a small share of the “global” total.
As expected, technology and innovation are the most important factors in this industry. Every decade has been dominated by one or two platforms (consoles), making sales on older consoles immaterial to the “global” total.
Regardless of genre, platform, rating, or decade, a video game can unexpectedly become an “outlier” and sell millions of copies, even if similar games in the past never reached that level of sales.
“Minecraft”, with 238 million copies sold, is not included in this dataset.
The “Multi-platform”, “PC”, and “Mobile” platform categories are not included in this dataset.
The population of “Japan” is smaller than that of “North America” or “Europe”, so it is logical that its level of contribution is lower too.