library(tidyverse)
library(lubridate)
library(plotly)
library(ggplot2)
library(gridExtra)
library(viridis)
# Per-column overview of the raw data: type, range and NA count of each field.
vgsales %>% summary()
##      Name             Platform         Year_of_Release   
##  Length:16719       Length:16719       Length:16719      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     Genre            Publisher            NA_Sales          EU_Sales     
##  Length:16719       Length:16719       Min.   : 0.0000   Min.   : 0.000  
##  Class :character   Class :character   1st Qu.: 0.0000   1st Qu.: 0.000  
##  Mode  :character   Mode  :character   Median : 0.0800   Median : 0.020  
##                                        Mean   : 0.2633   Mean   : 0.145  
##                                        3rd Qu.: 0.2400   3rd Qu.: 0.110  
##                                        Max.   :41.3600   Max.   :28.960  
##                                                                          
##     JP_Sales        Other_Sales        Global_Sales      Critic_Score  
##  Min.   : 0.0000   Min.   : 0.00000   Min.   : 0.0100   Min.   :13.00  
##  1st Qu.: 0.0000   1st Qu.: 0.00000   1st Qu.: 0.0600   1st Qu.:60.00  
##  Median : 0.0000   Median : 0.01000   Median : 0.1700   Median :71.00  
##  Mean   : 0.0776   Mean   : 0.04733   Mean   : 0.5335   Mean   :68.97  
##  3rd Qu.: 0.0400   3rd Qu.: 0.03000   3rd Qu.: 0.4700   3rd Qu.:79.00  
##  Max.   :10.2200   Max.   :10.57000   Max.   :82.5300   Max.   :98.00  
##                                                         NA's   :8582   
##   Critic_Count     User_Score          User_Count       Developer        
##  Min.   :  3.00   Length:16719       Min.   :    4.0   Length:16719      
##  1st Qu.: 12.00   Class :character   1st Qu.:   10.0   Class :character  
##  Median : 21.00   Mode  :character   Median :   24.0   Mode  :character  
##  Mean   : 26.36                      Mean   :  162.2                     
##  3rd Qu.: 36.00                      3rd Qu.:   81.0                     
##  Max.   :113.00                      Max.   :10665.0                     
##  NA's   :8582                        NA's   :9129                        
##     Rating         
##  Length:16719      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Data Dictionary


(All sales are in units by the millions)
Name: Name of the game
Platform: Console on which the game is running
Year_of_Release: No description provided
Genre: No description provided
Publisher: No description provided
NA_Sales: Game sales in North America
EU_Sales: Game sales in the European Union
JP_Sales: Game sales in Japan
Other_Sales: Game sales in the rest of the world, i.e. Africa, Asia excluding Japan, Australia, Europe excluding the E.U. and South America
Global_Sales: Total sales in the world
Critic_Score: Aggregate score compiled by Metacritic staff
Critic_Count: The number of critics used in coming up with the Critic_score
User_Score: Score by Metacritic’s subscribers
User_Count: Number of users who gave the user_score
Developer: Party responsible for creating the game
Rating: The ESRB ratings (E.g. Everyone, Teen, Adults Only..etc)

What I wanted to Do


I wanted to use the video game sales data set from Kaggle to explore what sells best (by year, genre, publisher, etc.).

##########################
#Total Sales by Year (in millions)
##########################
# Sum NA_Sales (North American sales, millions of units) per release year.
# The result has two columns: `Year` and the summed value `x`.
# NOTE(review): the section header says "Total Sales", but only NA_Sales is
# summed here -- confirm whether Global_Sales was intended.
total <- aggregate(
  x = vgsales$NA_Sales,
  by = list(Year = vgsales$Year_of_Release),
  FUN = sum
)
total
##    Year      x
## 1  1980  10.59
## 2  1981  33.40
## 3  1982  26.92
## 4  1983   7.76
## 5  1984  33.28
## 6  1985  33.73
## 7  1986  12.50
## 8  1987   8.46
## 9  1988  23.87
## 10 1989  45.15
## 11 1990  25.46
## 12 1991  12.76
## 13 1992  33.89
## 14 1993  16.90
## 15 1994  28.16
## 16 1995  24.83
## 17 1996  86.76
## 18 1997  94.75
## 19 1998 128.36
## 20 1999 126.06
## 21 2000  94.50
## 22 2001 173.98
## 23 2002 216.19
## 24 2003 193.61
## 25 2004 222.51
## 26 2005 242.15
## 27 2006 262.13
## 28 2007 309.89
## 29 2008 348.69
## 30 2009 335.55
## 31 2010 300.65
## 32 2011 238.79
## 33 2012 153.26
## 34 2013 153.65
## 35 2014 132.27
## 36 2015 106.86
## 37 2016  44.93
## 38 2017   0.00
## 39 2020   0.27
## 40  N/A  59.15
# Bar chart of the per-year sums held in `total`, tallest bar first.
# Columns are referenced by bare name inside aes() so ggplot2 looks them up in
# `total` itself, and the axis title is passed as lowercase `y`: labs() ignores
# an argument spelled `Y`, which left the y axis showing the raw expression.
ggplot(total, aes(x = reorder(Year, -x), y = x)) +
  geom_bar(stat = "identity", fill = "blue") +
  theme(axis.text.x = element_text(size = 8, angle = 90)) +
  labs(x = "Year", y = "Total Sales")


In this plot we can see that the best-selling year is 2008. A quick Google search brought up plenty of articles about what transpired that year.


The way my original plot was set up made it appear that 2007 was the best-selling year. However, a wise man told me I needed to reorder my plots for easier readability, and the results confused and surprised me, so I have included two articles:

https://www.cnet.com/news/video-game-sales-explode-in-industrys-best-month-ever/


https://www.gamespot.com/articles/why-2007-was-the-best-year-in-gaming/1100-6424367/

The following plot is more to satisfy my curiosity about the best-selling genres.

##########################
#Global Sales by Genre
##########################
# Sum Global_Sales (millions of units) per genre.
# The result has two columns: `Genre` and the summed value `x`.
totG <- aggregate(
  x = vgsales$Global_Sales,
  by = list(Genre = vgsales$Genre),
  FUN = sum
)
totG
##           Genre       x
## 1        Action 1745.27
## 2     Adventure  237.69
## 3      Fighting  447.48
## 4          Misc  803.18
## 5      Platform  828.08
## 6        Puzzle  243.02
## 7        Racing  728.90
## 8  Role-Playing  934.40
## 9       Shooter 1052.94
## 10   Simulation  390.42
## 11       Sports 1332.00
## 12     Strategy  174.50
# Bar chart of global sales per genre, best-selling genre first.
# Columns are referenced by bare name inside aes(): writing `totG$Genre` there
# bypasses ggplot2's data masking and triggers its "use of `$` is discouraged"
# warning. Axis titles are set explicitly, so the rendered plot is unchanged.
ggplot(totG, aes(x = reorder(Genre, -x), y = x)) +
  geom_bar(stat = "identity", fill = "magenta") +
  theme(axis.text.x = element_text(size = 10, angle = 45)) +
  labs(x = "Genre", y = "Global Sales")


To be honest, this kind of surprised me because I thought that Shooters or even Sports games would be number one, mostly due to the sheer volume of repeated titles that come out at the same time every year (Call of Duty, FIFA, Madden, etc.).
To me, Action is considerably broader than the genres that I thought would take the #1 spot. So I filtered out the games that are in this “Action” category to see if I could get an idea why just from the titles.

# Keep only the rows whose Genre is exactly "Action", then print them.
action <- vgsales %>%
  filter(Genre == "Action")
action
## # A tibble: 3,370 x 16
##    Name        Platform Year_of_Release Genre Publisher  NA_Sales EU_Sales
##    <chr>       <chr>    <chr>           <chr> <chr>         <dbl>    <dbl>
##  1 Grand Thef~ PS3      2013            Acti~ Take-Two ~     7.02    9.09 
##  2 Grand Thef~ PS2      2004            Acti~ Take-Two ~     9.43    0.400
##  3 Grand Thef~ X360     2013            Acti~ Take-Two ~     9.66    5.14 
##  4 Grand Thef~ PS2      2002            Acti~ Take-Two ~     8.41    5.49 
##  5 Grand Thef~ PS2      2001            Acti~ Take-Two ~     6.99    4.51 
##  6 Grand Thef~ PS4      2014            Acti~ Take-Two ~     3.96    6.31 
##  7 Pokemon He~ DS       2009            Acti~ Nintendo       4.34    2.71 
##  8 Grand Thef~ X360     2008            Acti~ Take-Two ~     6.76    3.07 
##  9 Grand Thef~ PS3      2008            Acti~ Take-Two ~     4.76    3.69 
## 10 FIFA Socce~ PS3      2012            Acti~ Electroni~     1.06    5.01 
## # ... with 3,360 more rows, and 9 more variables: JP_Sales <dbl>,
## #   Other_Sales <dbl>, Global_Sales <dbl>, Critic_Score <int>,
## #   Critic_Count <int>, User_Score <chr>, User_Count <int>,
## #   Developer <chr>, Rating <chr>


Top 10 Publishers In Global Sales


Next I wanted to see which 10 publishers have the highest global sales.

# Top 10 publishers ranked by summed Global_Sales (millions of units).
#
# The previous version ranked publishers with table() and plotted
# `Global_Sales = n()`, i.e. the NUMBER OF TITLES per publisher, while the
# section is about global sales. Both the ranking and the bar heights now use
# the sum of Global_Sales.

# One row per publisher with its total global sales, largest first.
publisher_sales <- vgsales %>%
  group_by(Publisher) %>%
  summarise(Global_Sales = sum(Global_Sales, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(!is.na(Publisher)) %>%
  arrange(desc(Global_Sales))

# Names of the ten best-selling publishers, best first.
top10 <- head(publisher_sales$Publisher, 10)

TopSales <- publisher_sales %>%
  filter(Publisher %in% top10) %>%
  # Reversed factor levels so that, after coord_flip(), the best seller is on top.
  mutate(Publisher = factor(Publisher, levels = rev(top10))) %>%
  ggplot(aes(x = Publisher, y = Global_Sales)) +
  geom_col(fill = "lightsteelblue4") +
  labs(y = "Sales") +
  coord_flip()

TopSales


Nintendo has the most global sales, and the Xbox 360 has the highest-selling game (Halo 3) during the highest-selling year. I want to compare the top 10 games for each console and see how they stack up against each other.

########################################
#Creating the subsets I want to compare
########################################
# Wii titles, reduced to the four columns used in the comparison below.
Nintendo <- vgsales %>%
  filter(Platform == "Wii") %>%
  select(Name, Year_of_Release, Genre, Global_Sales)

# Xbox 360 titles, same four columns.
xbox <- vgsales %>%
  filter(Platform == "X360") %>%
  select(Name, Year_of_Release, Genre, Global_Sales)


#############
#Nintendo Wii
#############
# Ten Wii titles with the highest Global_Sales, best seller first.
NinTop10 <- Nintendo %>%
  arrange(desc(Global_Sales)) %>%
  head(10)

# Horizontal bar chart of those titles, ordered by sales. The axis is fixed to
# 0-100 -- the same limits as the Xbox 360 panel -- so the two plots are
# directly comparable when shown side by side.
NintendoSales <- ggplot(
  NinTop10,
  aes(x = reorder(Name, Global_Sales), y = Global_Sales)
) +
  geom_col(fill = "royalblue3") +
  labs(x = "Game Title", y = "Sales") +
  coord_flip() +
  ylim(0, 100)

#############
#Xbox 360
#############
# Ten Xbox 360 titles with the highest Global_Sales, best seller first.
Xbox10 <- xbox %>%
  arrange(desc(Global_Sales)) %>%
  head(10)

# Horizontal bar chart of those titles, ordered by sales. The axis is fixed to
# 0-100 -- the same limits as the Wii panel -- so the two plots are directly
# comparable when shown side by side.
XboxSales <- ggplot(
  Xbox10,
  aes(x = reorder(Name, Global_Sales), y = Global_Sales)
) +
  geom_col(fill = "springgreen3") +
  labs(x = "Game Title", y = "Sales") +
  coord_flip() +
  ylim(0, 100)


# Draw the Wii and Xbox 360 top-10 charts next to each other under one title.
grid.arrange(
  grobs = list(NintendoSales, XboxSales),
  ncol = 2,
  top = "Top 10: Nintendo Wii and Xbox 360"
)


At first, this kind of surprised me, but then I realized that the Wii broke into a new, more mainstream kind of gaming. What Nintendo was offering wasn’t just appealing to committed video game enthusiasts (quite the contrary, if I had to guess), but to the general masses. It was exciting new technology that the whole family could enjoy and play together, and it was more “active” than just sitting on the couch or in your chair.

Finally I want to compare the sales between platforms over time.

# Stacked columns of global sales per release year, one fill colour per platform.
#
# Only rows that lack a platform, a usable year or a sales figure are dropped.
# Calling na.omit() on the whole table also threw away every game with a
# missing Critic_Score, Critic_Count or User_Count (thousands of rows), which
# understated the yearly totals. Missing years are stored as the
# string "N/A" rather than NA, so they are filtered out explicitly.
vgsales %>%
  filter(
    !is.na(Platform),
    !is.na(Year_of_Release),
    Year_of_Release != "N/A",
    !is.na(Global_Sales)
  ) %>%
  group_by(Platform, Year_of_Release) %>%
  summarize(total = sum(Global_Sales)) %>%
  ungroup() %>%
  ggplot(aes(x = Year_of_Release, y = total, fill = Platform)) +
  geom_col(position = "stack") +
  # Year_of_Release is a character column, so the axis is discrete; label every
  # fifth year to keep the tick text readable.
  scale_x_discrete(breaks = as.character(seq(1980, 2020, by = 5))) +
  scale_fill_viridis(discrete = TRUE) +
  theme(axis.text.x = element_text(size = 10, angle = 45)) +
  # Sales figures are unit counts in millions, not dollar amounts.
  labs(
    y = "Sales (millions of units)",
    x = "Year of Release",
    title = "Global Sales by Platform Over Time"
  )


I find this plot the most interesting of all because it is almost a nice bell curve. The distribution in it is quite even, with the exception of the spike in 2001 and then the ups and downs of the following 4 years. However, I have a hypothesis that those spikes are caused either by a new and exciting platform or by a big title coming back after a long hiatus. The dip in the more recent years could be due to digital download sales.