#Introduction_and_Preparation

# Load the raw video-game sales data from CSV.
# stringsAsFactors = FALSE keeps text columns as character on every R version,
# which the numeric conversions further down rely on.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this script on another computer.
Video_Games <- read.csv("/Users/apple/Desktop/R/Video_Games.csv",
                        stringsAsFactors = FALSE)

# First look at the column types and value ranges.
summary(Video_Games)
##      Name             Platform         Year_of_Release       Genre          
##  Length:16719       Length:16719       Length:16719       Length:16719      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Publisher            NA_Sales          EU_Sales         JP_Sales      
##  Length:16719       Min.   : 0.0000   Min.   : 0.000   Min.   : 0.0000  
##  Class :character   1st Qu.: 0.0000   1st Qu.: 0.000   1st Qu.: 0.0000  
##  Mode  :character   Median : 0.0800   Median : 0.020   Median : 0.0000  
##                     Mean   : 0.2633   Mean   : 0.145   Mean   : 0.0776  
##                     3rd Qu.: 0.2400   3rd Qu.: 0.110   3rd Qu.: 0.0400  
##                     Max.   :41.3600   Max.   :28.960   Max.   :10.2200  
##                                                                         
##   Other_Sales        Global_Sales      Critic_Score    Critic_Count   
##  Min.   : 0.00000   Min.   : 0.0100   Min.   :13.00   Min.   :  3.00  
##  1st Qu.: 0.00000   1st Qu.: 0.0600   1st Qu.:60.00   1st Qu.: 12.00  
##  Median : 0.01000   Median : 0.1700   Median :71.00   Median : 21.00  
##  Mean   : 0.04733   Mean   : 0.5335   Mean   :68.97   Mean   : 26.36  
##  3rd Qu.: 0.03000   3rd Qu.: 0.4700   3rd Qu.:79.00   3rd Qu.: 36.00  
##  Max.   :10.57000   Max.   :82.5300   Max.   :98.00   Max.   :113.00  
##                                       NA's   :8582    NA's   :8582    
##   User_Score          User_Count       Developer            Rating         
##  Length:16719       Min.   :    4.0   Length:16719       Length:16719      
##  Class :character   1st Qu.:   10.0   Class :character   Class :character  
##  Mode  :character   Median :   24.0   Mode  :character   Mode  :character  
##                     Mean   :  162.2                                        
##                     3rd Qu.:   81.0                                        
##                     Max.   :10665.0                                        
##                     NA's   :9129
library(ggplot2)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Year_of_Release and User_Score were read as character columns, so convert
# them to numeric. Entries that are not valid numbers become NA, which is
# what produces the coercion warnings recorded below.
Video_Games$Year_of_Release <-
  as.numeric(as.character(Video_Games$Year_of_Release))
## Warning: NAs introduced by coercion
Video_Games$User_Score <-
  as.numeric(as.character(Video_Games$User_Score))
## Warning: NAs introduced by coercion

# Re-check the summary to confirm both columns are now numeric.
summary(Video_Games)
##      Name             Platform         Year_of_Release    Genre          
##  Length:16719       Length:16719       Min.   :1980    Length:16719      
##  Class :character   Class :character   1st Qu.:2003    Class :character  
##  Mode  :character   Mode  :character   Median :2007    Mode  :character  
##                                        Mean   :2006                      
##                                        3rd Qu.:2010                      
##                                        Max.   :2020                      
##                                        NA's   :269                       
##   Publisher            NA_Sales          EU_Sales         JP_Sales      
##  Length:16719       Min.   : 0.0000   Min.   : 0.000   Min.   : 0.0000  
##  Class :character   1st Qu.: 0.0000   1st Qu.: 0.000   1st Qu.: 0.0000  
##  Mode  :character   Median : 0.0800   Median : 0.020   Median : 0.0000  
##                     Mean   : 0.2633   Mean   : 0.145   Mean   : 0.0776  
##                     3rd Qu.: 0.2400   3rd Qu.: 0.110   3rd Qu.: 0.0400  
##                     Max.   :41.3600   Max.   :28.960   Max.   :10.2200  
##                                                                         
##   Other_Sales        Global_Sales      Critic_Score    Critic_Count   
##  Min.   : 0.00000   Min.   : 0.0100   Min.   :13.00   Min.   :  3.00  
##  1st Qu.: 0.00000   1st Qu.: 0.0600   1st Qu.:60.00   1st Qu.: 12.00  
##  Median : 0.01000   Median : 0.1700   Median :71.00   Median : 21.00  
##  Mean   : 0.04733   Mean   : 0.5335   Mean   :68.97   Mean   : 26.36  
##  3rd Qu.: 0.03000   3rd Qu.: 0.4700   3rd Qu.:79.00   3rd Qu.: 36.00  
##  Max.   :10.57000   Max.   :82.5300   Max.   :98.00   Max.   :113.00  
##                                       NA's   :8582    NA's   :8582    
##    User_Score      User_Count       Developer            Rating         
##  Min.   :0.000   Min.   :    4.0   Length:16719       Length:16719      
##  1st Qu.:6.400   1st Qu.:   10.0   Class :character   Class :character  
##  Median :7.500   Median :   24.0   Mode  :character   Mode  :character  
##  Mean   :7.125   Mean   :  162.2                                        
##  3rd Qu.:8.200   3rd Qu.:   81.0                                        
##  Max.   :9.700   Max.   :10665.0                                        
##  NA's   :9129    NA's   :9129

In the first stage, we imported the dataset “Video Games Sales 2022”, which was downloaded from Kaggle. The summary showed that column 3 (“Year_of_Release”) and column 13 (“User_Score”) were stored as character data, so we converted them to numeric for analysis purposes. We also loaded the libraries needed for this project. With that, our data is almost ready for plotting.

#Adding_new_column

# Add a "Year" column holding the release decade as an ordered-level factor.
# Conditions are checked top-down, so each year lands in the first decade
# whose lower bound it reaches; everything from 2010 onward is labelled "10s".
# Years below 1980 and missing years match no condition and become NA.
Video_Games <- Video_Games %>%
  mutate(
    Year = factor(
      case_when(
        Year_of_Release >= 2010 ~ "10s",
        Year_of_Release >= 2000 ~ "00s",
        Year_of_Release >= 1990 ~ "90s",
        Year_of_Release >= 1980 ~ "80s"
      ),
      # Chronological level order, so plots do not sort decades alphabetically.
      levels = c("80s", "90s", "00s", "10s")
    )
  )

Next, we added a new column, “Year”, which classifies each game by the decade in which it was released: “80s”, “90s”, “00s” (the noughties) and “10s” (the tens). We also turned it into a factor whose levels are in chronological order, to avoid problems when sorting by decade.

#Global_Sales_Histogram

# Histogram of global sales per release year.
# Each game is weighted by its Global_Sales, so the bar height is the number
# of copies sold for that year. Without the weight, the bars would count the
# number of titles released, which contradicts the title and y-axis label.
# na.rm = TRUE silently drops games whose release year is missing.
ggplot(Video_Games, aes(x = Year_of_Release, weight = Global_Sales)) +
  geom_histogram(fill = "gray", color = "black", binwidth = 1, na.rm = TRUE) +
  labs(x = "Year of Release", y = "Total Amount of Copies Sold",
       title = "Global Sales by Year") +
  theme(plot.title = element_text(hjust = 0.5))

We created a histogram of our data to get a general overview, and saw that overall sales increased sharply from 1990 to 2010.

#Top_10

# Select the ten best-selling titles by rank rather than by a hand-tuned
# sales cut-off, so the result is always exactly the top 10 even if the
# dataset is updated.
Top_10 <- Video_Games %>%
  arrange(desc(Global_Sales)) %>%
  head(10)

# Name/sales pairs for the bar chart.
dat <- data.frame(x = Top_10$Name, y = Top_10$Global_Sales)

# las = 2 turns the labels perpendicular to the axis; cex.names shrinks them
# so the long game titles fit.
barplot(dat$y, names.arg = dat$x, cex.names = 0.5, las = 2,
        main = "The Top 10 Best-Selling Video Games of All Time")

Next, we wanted to identify the best-selling video games, so we created a “Top 10” and found many famous titles, such as Wii Sports with over 82 million copies sold and Super Mario Bros. with about 40 million.

#Top_20

# Select the twenty best-selling titles by rank rather than by a hand-tuned
# sales cut-off, so the result is always exactly the top 20 even if the
# dataset is updated.
Top_20 <- Video_Games %>%
  arrange(desc(Global_Sales)) %>%
  head(20)

# Name/sales pairs for the bar chart.
dat <- data.frame(x = Top_20$Name, y = Top_20$Global_Sales)

# las = 2 turns the labels perpendicular to the axis; cex.names shrinks them
# so the long game titles fit.
barplot(dat$y, names.arg = dat$x, cex.names = 0.5, las = 2,
        main = "The Top 20 Best-Selling Video Games of All Time")

After that, we made a “Top 20” to find other video games that are well known worldwide; in this case we recognized the Grand Theft Auto and Pokemon series.

#Top_100

# Select the hundred best-selling titles by rank rather than by a hand-tuned
# sales cut-off, so the result is always exactly the top 100.
Top_100 <- Video_Games %>%
  arrange(desc(Global_Sales)) %>%
  head(100)

# Distribution of global sales per platform within the top 100.
# The legend is hidden because the fill colour only repeats the x axis.
ggplot(Top_100, aes(x = Platform, y = Global_Sales, fill = Platform)) +
  geom_boxplot(alpha = 0.6) +
  labs(x = "Platform", y = "Total Amount of Copies Sold",
       title = "The Top 100 Best-Selling Video Games by Platform") +
  theme(legend.position = "none",
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(hjust = 0.5, face = "bold"))

Since the “Top 100” represents over 90% of total sales, we drew a boxplot of these games to find out which platforms account for a large part of those sales.

#Excluding_number1

# Drop the single best-selling title from the top 100 so that it does not
# stretch the y axis. The subset is derived from Top_100 and the observed
# maximum instead of repeating the hard-coded sales figures, so it cannot
# drift out of sync with the top-100 selection.
Excluding_number1 <- Top_100 %>%
  filter(Global_Sales < max(Global_Sales))

# Same per-platform boxplot as for the top 100, now without the number one.
ggplot(Excluding_number1,
       aes(x = Platform, y = Global_Sales, fill = Platform)) +
  geom_boxplot(alpha = 0.6) +
  labs(x = "Platform", y = "Total Amount of Copies Sold",
       title = "Excluding Number 1") +
  theme(legend.position = "none",
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(hjust = 0.5, face = "bold"))

To get a clearer view of the data, we excluded the number-one title from the top list and worked with the remaining range (ranks 2 to 100).

#Main_Platforms

# Keep only the six platforms the analysis focuses on.
Main_Platforms <- Excluding_number1 %>%
  filter(Platform %in% c("DS", "GB", "NES", "PS2", "Wii", "X360"))

# Boxplot of global sales for each of the selected platforms.
# The legend is hidden because the fill colour only repeats the x axis.
ggplot(Main_Platforms, aes(Platform, Global_Sales, fill = Platform)) +
  geom_boxplot(alpha = 0.6) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  labs(
    x = "Platform",
    y = "Total Amount of Copies Sold",
    title = "Global Sales by Platform"
  )

Again, since six main platforms account for over 90% of our top list, we focused on the most representative ones (“DS”, “GB”, “NES”, “PS2”, “Wii” and “X360”).

#Rating

# Boxplot of global sales grouped by the game's rating category.
# The legend is hidden because the fill colour only repeats the x axis.
ggplot(Main_Platforms, aes(Rating, Global_Sales, fill = Rating)) +
  geom_boxplot(alpha = 0.6) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  labs(
    x = "Rating",
    y = "Total Amount of Copies Sold",
    title = "Global Sales by Rating"
  )

Classifying by rating, we found that games in the “E” category (Everyone) rank first in total copies sold, followed by “M” (Mature, 17+) and “T” (Teen, 13+). In last place is “E10+”, which covers games that are usually aimed at kids and contain mild violence, if any.

#Genre

# Boxplot of global sales grouped by genre.
# The legend is hidden because the fill colour only repeats the x axis.
ggplot(Main_Platforms, aes(Genre, Global_Sales, fill = Genre)) +
  geom_boxplot(alpha = 0.6) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  labs(
    x = "Genre",
    y = "Total Amount of Copies Sold",
    title = "Global Sales by Genre"
  )

Grouping our data by genre, the top category is “Platform” (a sub-genre of action video games), so we should consider the “Action” category the winner here. In the next places we found the “Racing”, “Sports” and “Puzzle” categories. (Although similar in some aspects, “Role Playing” is different from “Action”.)

#pairs_and_correlation_by_Region

# Release year plus the regional and global sales columns.
# The columns are selected by name instead of by position (3, 6:10), so the
# analysis does not silently pick the wrong columns if the CSV layout changes.
sales_columns <- c("Year_of_Release", "NA_Sales", "EU_Sales",
                   "JP_Sales", "Other_Sales", "Global_Sales")

# Scatter-plot matrix of every pair of the selected columns.
pairs(Main_Platforms[, sales_columns])

# Pairwise correlations; "complete.obs" uses only rows with no missing value
# in any of the selected columns.
cor(Main_Platforms[, sales_columns], use = "complete.obs")
##                 Year_of_Release    NA_Sales  EU_Sales    JP_Sales Other_Sales
## Year_of_Release       1.0000000 -0.50149180 0.1211181 -0.35589108  0.11268879
## NA_Sales             -0.5014918  1.00000000 0.2540051  0.21467092  0.07540978
## EU_Sales              0.1211181  0.25400505 1.0000000  0.42476773  0.13167708
## JP_Sales             -0.3558911  0.21467092 0.4247677  1.00000000 -0.05647155
## Other_Sales           0.1126888  0.07540978 0.1316771 -0.05647155  1.00000000
## Global_Sales         -0.3715746  0.83168126 0.6616454  0.56246632  0.28107735
##                 Global_Sales
## Year_of_Release   -0.3715746
## NA_Sales           0.8316813
## EU_Sales           0.6616454
## JP_Sales           0.5624663
## Other_Sales        0.2810773
## Global_Sales       1.0000000

As above, we did not find significant results regarding the correlation between sales and scores. As expected, while sales increased worldwide over the years, the number of “user scores” and “critic scores” grew too.

#Global_Sales_by_Platform

# One panel per platform: each point is a game, placed by release decade (x)
# and global sales (y).
ggplot(Main_Platforms, aes(Year, Global_Sales)) +
  geom_point() +
  facet_wrap(~Platform) +
  labs(
    x = "Year of Release",
    y = "Total Amount of Copies Sold",
    title = "Global Sales of All Time by Platform"
  ) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

In the next stage, sorting global sales by platform through the years, we could confirm that in the “80s” and “90s” the “GB” and “NES” platforms represented almost all sales. In the “00s” most sales belonged to “DS”, “PS2” and “Wii”, and “X360” is the winner in the “10s”.

#North_America_Sales_by_Platform

# One panel per platform: each point is a game, placed by release decade (x)
# and North America sales (y).
ggplot(Main_Platforms, aes(Year, NA_Sales)) +
  geom_point() +
  facet_wrap(~Platform) +
  labs(
    x = "Year of Release",
    y = "Total Amount of Copies Sold",
    title = "North America Sales of All Time by Platform"
  ) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Since the “North America” region is the biggest market worldwide, 50 to 60% of global sales by platform belong to this region.

#Europe_Sales_by_Platform

# One panel per platform: each point is a game, placed by release decade (x)
# and Europe sales (y).
ggplot(Main_Platforms, aes(Year, EU_Sales)) +
  geom_point() +
  facet_wrap(~Platform) +
  labs(
    x = "Year of Release",
    y = "Total Amount of Copies Sold",
    title = "Europe Sales of All Time by Platform"
  ) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Comparing the “Europe” and “North America” regions, we can point out that in the “80s”, “GB” and “NES” sales in “North America” were more than double the “Europe” sales. Also, “PS2” sales in the “00s” and “X360” sales in the “10s” were higher in “North America”. However, “DS” sales in the “00s” were almost the same, at around 10 million copies sold in each region.

#Japan_Sales_by_Platform

# One panel per platform: each point is a game, placed by release decade (x)
# and Japan sales (y).
ggplot(Main_Platforms, aes(Year, JP_Sales)) +
  geom_point() +
  facet_wrap(~Platform) +
  labs(
    x = "Year of Release",
    y = "Total Amount of Copies Sold",
    title = "Japan Sales of All Time by Platform"
  ) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

In the “80s” and “90s”, “GB” and “NES” sales in “Japan” were relatively strong (locally), but in recent years overall sales on all platforms have been almost immaterial.

#Other_Regions_Sales_by_Platform

# One panel per platform: each point is a game, placed by release decade (x)
# and sales in the remaining regions (y).
ggplot(Main_Platforms, aes(Year, Other_Sales)) +
  geom_point() +
  facet_wrap(~Platform) +
  labs(
    x = "Year of Release",
    y = "Total Amount of Copies Sold",
    title = "Other Regions Sales of All Time by Platform"
  ) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Finally, in “Other Regions” we found that only “PS2” sales in the “00s” made a significant contribution to the “global” total.

#Coefficient_of_variation_Global

# Coefficient of variation of global sales: the standard deviation expressed
# as a percentage of the mean. Missing values are ignored in both statistics.
stdev <- sd(Main_Platforms[["Global_Sales"]], na.rm = TRUE)
print(stdev)
## [1] 8.230985
avg <- mean(Main_Platforms[["Global_Sales"]], na.rm = TRUE)
print(avg)
## [1] 16.36579
Coefvar <- 100 * (stdev / avg)
print(Coefvar)
## [1] 50.29384

In the last stage, we calculated the standard deviation, mean and coefficient of variation for global sales and for each region. Since a value of around 25% is generally accepted as indicating low variance, our “global” coefficient of variation of 50.29% can be considered medium variance. Hence, we can expect sales in the coming years to be volatile, not by any means stable.

#Coefficient_of_variation_North_America

# Coefficient of variation of North America sales: the standard deviation
# expressed as a percentage of the mean. Missing values are ignored.
stdev_NA <- sd(Main_Platforms$NA_Sales, na.rm = TRUE)
stdev_NA
## [1] 5.499128
avg_NA <- mean(Main_Platforms$NA_Sales, na.rm = TRUE)
avg_NA
## [1] 8.288772
Coefvar_NA <- (stdev_NA / avg_NA) * 100
Coefvar_NA
## [1] 66.3443

# Global (black) vs North America (blue) sales per release year.
# Main_Platforms has one row per game, so a year can hold several rows;
# drawing geom_line() on the raw rows joins the individual games and produces
# a zig-zag. Sales are therefore summed per year first, so each line shows
# the yearly total that the y-axis label describes. Games without a release
# year are dropped because they cannot be placed on the x axis.
Main_Platforms %>%
  filter(!is.na(Year_of_Release)) %>%
  group_by(Year_of_Release) %>%
  summarise(Global_Sales = sum(Global_Sales, na.rm = TRUE),
            NA_Sales = sum(NA_Sales, na.rm = TRUE)) %>%
  ggplot(aes(x = Year_of_Release)) +
  geom_line(aes(y = Global_Sales), color = "Black") +
  geom_point(aes(y = Global_Sales), color = "Black") +
  geom_line(aes(y = NA_Sales), color = "Blue") +
  geom_point(aes(y = NA_Sales), color = "Blue") +
  labs(x = "", y = "Total Amount of Copies Sold",
       title = "Global vs North America Sales") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Calculating the coefficient of variation for the “North America” region, we got a somewhat higher value of 66.34%, which means a bigger variance than “global”. We also created a graph in which we could confirm that most of the time “North America” sales made a big contribution to the “global” total, except in the “90s”, when the contribution level decreased.

#Coefficient_of_variation_Europe

# Coefficient of variation of Europe sales: the standard deviation expressed
# as a percentage of the mean. Missing values are ignored.
stdev_EU <- sd(Main_Platforms$EU_Sales, na.rm = TRUE)
stdev_EU
## [1] 2.83954
avg_EU <- mean(Main_Platforms$EU_Sales, na.rm = TRUE)
avg_EU
## [1] 4.283158
Coefvar_EU <- (stdev_EU / avg_EU) * 100
Coefvar_EU
## [1] 66.29549

# Global (black) vs Europe (green) sales per release year.
# Main_Platforms has one row per game, so a year can hold several rows;
# drawing geom_line() on the raw rows joins the individual games and produces
# a zig-zag. Sales are therefore summed per year first, so each line shows
# the yearly total that the y-axis label describes. Games without a release
# year are dropped because they cannot be placed on the x axis.
Main_Platforms %>%
  filter(!is.na(Year_of_Release)) %>%
  group_by(Year_of_Release) %>%
  summarise(Global_Sales = sum(Global_Sales, na.rm = TRUE),
            EU_Sales = sum(EU_Sales, na.rm = TRUE)) %>%
  ggplot(aes(x = Year_of_Release)) +
  geom_line(aes(y = Global_Sales), color = "Black") +
  geom_point(aes(y = Global_Sales), color = "Black") +
  geom_line(aes(y = EU_Sales), color = "Green") +
  geom_point(aes(y = EU_Sales), color = "Green") +
  labs(x = "", y = "Total Amount of Copies Sold",
       title = "Global vs Europe Sales") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

In the case of “Europe”, the coefficient of variation is 66.30%, similar to the “North America” region and higher than “global”. In the graph, we could confirm that the “Europe” contribution to the “global” total was low, especially in the “80s” and “90s”.

#Coefficient_of_variation_Japan

# Coefficient of variation of Japan sales: the standard deviation expressed
# as a percentage of the mean. Missing values are ignored.
stdev_JP <- sd(Main_Platforms$JP_Sales, na.rm = TRUE)
stdev_JP
## [1] 2.335691
avg_JP <- mean(Main_Platforms$JP_Sales, na.rm = TRUE)
avg_JP
## [1] 2.350877
Coefvar_JP <- (stdev_JP / avg_JP) * 100
Coefvar_JP
## [1] 99.35404

# Global (black) vs Japan (red) sales per release year.
# Main_Platforms has one row per game, so a year can hold several rows;
# drawing geom_line() on the raw rows joins the individual games and produces
# a zig-zag. Sales are therefore summed per year first, so each line shows
# the yearly total that the y-axis label describes. Games without a release
# year are dropped because they cannot be placed on the x axis.
Main_Platforms %>%
  filter(!is.na(Year_of_Release)) %>%
  group_by(Year_of_Release) %>%
  summarise(Global_Sales = sum(Global_Sales, na.rm = TRUE),
            JP_Sales = sum(JP_Sales, na.rm = TRUE)) %>%
  ggplot(aes(x = Year_of_Release)) +
  geom_line(aes(y = Global_Sales), color = "Black") +
  geom_point(aes(y = Global_Sales), color = "Black") +
  geom_line(aes(y = JP_Sales), color = "Red") +
  geom_point(aes(y = JP_Sales), color = "Red") +
  labs(x = "", y = "Total Amount of Copies Sold",
       title = "Global vs Japan Sales") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

In the case of “Japan”, the coefficient of variation is very high, at 99.35%. Its contribution to the “global” total was low most of the time.

#Coefficient_of_variation_Other_Regions

# Coefficient of variation of sales in the remaining regions: the standard
# deviation expressed as a percentage of the mean. Missing values are ignored.
stdev_Other <- sd(Main_Platforms$Other_Sales, na.rm = TRUE)
stdev_Other
## [1] 1.656475
avg_Other <- mean(Main_Platforms$Other_Sales, na.rm = TRUE)
avg_Other
## [1] 1.44193
Coefvar_Other <- (stdev_Other / avg_Other) * 100
Coefvar_Other
## [1] 114.879

# Global (black) vs other-regions (yellow) sales per release year.
# Main_Platforms has one row per game, so a year can hold several rows;
# drawing geom_line() on the raw rows joins the individual games and produces
# a zig-zag. Sales are therefore summed per year first, so each line shows
# the yearly total that the y-axis label describes. Games without a release
# year are dropped because they cannot be placed on the x axis.
Main_Platforms %>%
  filter(!is.na(Year_of_Release)) %>%
  group_by(Year_of_Release) %>%
  summarise(Global_Sales = sum(Global_Sales, na.rm = TRUE),
            Other_Sales = sum(Other_Sales, na.rm = TRUE)) %>%
  ggplot(aes(x = Year_of_Release)) +
  geom_line(aes(y = Global_Sales), color = "Black") +
  geom_point(aes(y = Global_Sales), color = "Black") +
  geom_line(aes(y = Other_Sales), color = "Yellow") +
  geom_point(aes(y = Other_Sales), color = "Yellow") +
  labs(x = "", y = "Total Amount of Copies Sold",
       title = "Global vs Other Regions Sales") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Finally, calculating the coefficient of variation for “Other Regions”, we got the highest value of the group, 114.88%. We also found that it contributed significantly to the “global” total only around 2004, which confirmed our result above that “PS2” sales in this region were material in the “00s”.

CONCLUSIONS

After analyzing this dataset, we got some interesting outcomes.

  • The “North America” region is the biggest market and has made a large contribution to the “global” total most of the time.

  • While “global” sales have kept increasing through the years, “Japan” and “Other regions” sales have continued to make only a small contribution to the “global” total.

  • As expected, technology and innovation are the most important factors in this industry. Every decade has been dominated by one or two platforms (consoles), making sales of old consoles immaterial to the “global” total.

  • Regardless of genre, platform, rating or decade, a video game can unexpectedly become an “outlier” and sell millions of copies, even if similar games in the past did not reach that level of sales.

CONSIDERATIONS

We should be aware that these conclusions are based on the specific outcomes of this dataset. Since some platforms and best-selling video games are not included here, we cannot be 100% sure that these results are accurate.

  • “Minecraft”, with 238 million copies sold, is not included in this dataset.

  • The “Multi-platform”, “PC” and “Mobile” platform categories are not included in this dataset.

  • The population of “Japan” is smaller than that of “North America” or “Europe”, so it is logical that its contribution level is lower too.