library(tidyverse)
library(lubridate)
library(plotly)
library(ggplot2)
library(gridExtra)
library(viridis)
# Column-by-column overview of the raw data: type of each column, quartiles
# for the numeric ones, and how many values are missing.
vgsales %>%
  summary()
## Name Platform Year_of_Release
## Length:16719 Length:16719 Length:16719
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Genre Publisher NA_Sales EU_Sales
## Length:16719 Length:16719 Min. : 0.0000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.0000 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.0800 Median : 0.020
## Mean : 0.2633 Mean : 0.145
## 3rd Qu.: 0.2400 3rd Qu.: 0.110
## Max. :41.3600 Max. :28.960
##
## JP_Sales Other_Sales Global_Sales Critic_Score
## Min. : 0.0000 Min. : 0.00000 Min. : 0.0100 Min. :13.00
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0600 1st Qu.:60.00
## Median : 0.0000 Median : 0.01000 Median : 0.1700 Median :71.00
## Mean : 0.0776 Mean : 0.04733 Mean : 0.5335 Mean :68.97
## 3rd Qu.: 0.0400 3rd Qu.: 0.03000 3rd Qu.: 0.4700 3rd Qu.:79.00
## Max. :10.2200 Max. :10.57000 Max. :82.5300 Max. :98.00
## NA's :8582
## Critic_Count User_Score User_Count Developer
## Min. : 3.00 Length:16719 Min. : 4.0 Length:16719
## 1st Qu.: 12.00 Class :character 1st Qu.: 10.0 Class :character
## Median : 21.00 Mode :character Median : 24.0 Mode :character
## Mean : 26.36 Mean : 162.2
## 3rd Qu.: 36.00 3rd Qu.: 81.0
## Max. :113.00 Max. :10665.0
## NA's :8582 NA's :9129
## Rating
## Length:16719
## Class :character
## Mode :character
##
##
##
##
I wanted to use the video game sales data set from Kaggle and explore which items sell best (by year, genre, publisher, etc.).
##########################
# Total Sales by Year (in millions)
##########################
# Sum the sales figures within each release year.
# NOTE(review): only NA_Sales is summed here even though the heading reads as
# overall sales -- confirm whether Global_Sales was intended.
# aggregate() names the summed column `x` and the grouping column `Year`.
total <- aggregate(
  x = vgsales$NA_Sales,
  by = list(Year = vgsales$Year_of_Release),
  FUN = sum
)
total
## Year x
## 1 1980 10.59
## 2 1981 33.40
## 3 1982 26.92
## 4 1983 7.76
## 5 1984 33.28
## 6 1985 33.73
## 7 1986 12.50
## 8 1987 8.46
## 9 1988 23.87
## 10 1989 45.15
## 11 1990 25.46
## 12 1991 12.76
## 13 1992 33.89
## 14 1993 16.90
## 15 1994 28.16
## 16 1995 24.83
## 17 1996 86.76
## 18 1997 94.75
## 19 1998 128.36
## 20 1999 126.06
## 21 2000 94.50
## 22 2001 173.98
## 23 2002 216.19
## 24 2003 193.61
## 25 2004 222.51
## 26 2005 242.15
## 27 2006 262.13
## 28 2007 309.89
## 29 2008 348.69
## 30 2009 335.55
## 31 2010 300.65
## 32 2011 238.79
## 33 2012 153.26
## 34 2013 153.65
## 35 2014 132.27
## 36 2015 106.86
## 37 2016 44.93
## 38 2017 0.00
## 39 2020 0.27
## 40 N/A 59.15
# Bar chart of the yearly totals, tallest bar first.
# Columns are referenced by bare name inside aes() (ggplot2 discourages
# `total$...` there because it bypasses the plot's data argument).
# The axis title must be given as lowercase `y`: `labs(Y = ...)` does not
# match the y aesthetic, so the intended "Total Sales" title was never shown.
ggplot(total, aes(x = reorder(Year, -x), y = x)) +
  geom_bar(stat = "identity", fill = "blue") +
  theme(axis.text.x = element_text(size = 8, angle = 90)) +
  labs(x = "Year", y = "Total Sales")
In this plot we can see that the best-selling year is 2008. A quick Google search brought up plenty of articles about what transpired that year.
The way my original plot was set up made it appear that 2007 was the best-selling year. However, a wise man told me I needed to reorder my plots for easier readability, and the results confused and surprised me, so I have included two articles:
https://www.cnet.com/news/video-game-sales-explode-in-industrys-best-month-ever/
https://www.gamespot.com/articles/why-2007-was-the-best-year-in-gaming/1100-6424367/
The following plot is more to satisfy my curiosity about the best-selling genres.
##########################
# Global Sales by Genre
##########################
# Sum Global_Sales within each genre.
# aggregate() names the summed column `x` and the grouping column `Genre`.
totG <- aggregate(
  x = vgsales$Global_Sales,
  by = list(Genre = vgsales$Genre),
  FUN = sum
)
totG
## Genre x
## 1 Action 1745.27
## 2 Adventure 237.69
## 3 Fighting 447.48
## 4 Misc 803.18
## 5 Platform 828.08
## 6 Puzzle 243.02
## 7 Racing 728.90
## 8 Role-Playing 934.40
## 9 Shooter 1052.94
## 10 Simulation 390.42
## 11 Sports 1332.00
## 12 Strategy 174.50
# Bar chart of global sales per genre, best-selling genre first.
# Columns are referenced by bare name inside aes(); ggplot2 discourages
# `totG$...` there because it bypasses the plot's data argument.
ggplot(totG, aes(x = reorder(Genre, -x), y = x)) +
  geom_bar(stat = "identity", fill = "magenta") +
  theme(axis.text.x = element_text(size = 10, angle = 45)) +
  labs(x = "Genre", y = "Global Sales")
To be honest, this kind of surprised me because I thought that Shooters or even Sports games would be number one, mostly due to the sheer volume of repeated titles that come out at the same time every year (Call of Duty, FIFA, Madden, etc.).
To me, Action is considerably broader than the genres that I thought would take the #1 spot. So I filtered the data down to the games in this “Action” category to see if I could get an idea why just from the titles.
# Keep only the rows whose Genre is exactly "Action", then print the result
# to get a feel for which titles fall into that category.
action <- vgsales %>%
  filter(Genre == "Action")
action
## # A tibble: 3,370 x 16
## Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Grand Thef~ PS3 2013 Acti~ Take-Two ~ 7.02 9.09
## 2 Grand Thef~ PS2 2004 Acti~ Take-Two ~ 9.43 0.400
## 3 Grand Thef~ X360 2013 Acti~ Take-Two ~ 9.66 5.14
## 4 Grand Thef~ PS2 2002 Acti~ Take-Two ~ 8.41 5.49
## 5 Grand Thef~ PS2 2001 Acti~ Take-Two ~ 6.99 4.51
## 6 Grand Thef~ PS4 2014 Acti~ Take-Two ~ 3.96 6.31
## 7 Pokemon He~ DS 2009 Acti~ Nintendo 4.34 2.71
## 8 Grand Thef~ X360 2008 Acti~ Take-Two ~ 6.76 3.07
## 9 Grand Thef~ PS3 2008 Acti~ Take-Two ~ 4.76 3.69
## 10 FIFA Socce~ PS3 2012 Acti~ Electroni~ 1.06 5.01
## # ... with 3,360 more rows, and 9 more variables: JP_Sales <dbl>,
## # Other_Sales <dbl>, Global_Sales <dbl>, Critic_Score <int>,
## # Critic_Count <int>, User_Score <chr>, User_Count <int>,
## # Developer <chr>, Rating <chr>
Next I wanted to see the top 10 publishers by global sales.
# Top 10 publishers by total global sales.
# The totals are summed from Global_Sales. Counting rows (table() / n())
# ranks publishers by how many titles they released, not by how much they
# sold, which is not what the chart is labelled as showing.
publisher_sales <- vgsales %>%
  group_by(Publisher) %>%
  summarise(Global_Sales = sum(Global_Sales, na.rm = TRUE)) %>%
  ungroup()

# Names of the ten publishers with the largest totals, biggest first.
top10 <- publisher_sales %>%
  arrange(desc(Global_Sales)) %>%
  head(10) %>%
  pull(Publisher)

TopSales <- publisher_sales %>%
  filter(Publisher %in% top10) %>%
  # Reversed levels so that, after coord_flip(), the largest bar is on top.
  mutate(Publisher = factor(Publisher, levels = rev(top10))) %>%
  ggplot(aes(x = Publisher, y = Global_Sales)) +
  geom_col(fill = "lightsteelblue4") +
  labs(y = "Sales") +
  coord_flip()
TopSales
Nintendo has the most global sales, and the Xbox 360 has the highest-selling game (Halo 3) during the highest-selling year. I want to compare the top 10 games for each console and see how they stack up against each other.
########################################
# Creating the subsets I want to compare
########################################
# One subset per console. Both keep the same four columns (in their original
# order); only the Platform value they are filtered on differs.
Nintendo <- vgsales %>%
  filter(Platform == "Wii") %>%
  select(Name, Year_of_Release, Genre, Global_Sales)

xbox <- vgsales %>%
  filter(Platform == "X360") %>%
  select(Name, Year_of_Release, Genre, Global_Sales)
#############
# Nintendo Wii
#############
# Row positions of the ten largest Global_Sales values, largest first.
NinTop10 <- Nintendo[
  head(order(Nintendo$Global_Sales, decreasing = TRUE), 10),
]

# Horizontal bar chart of those ten titles, best seller at the top.
# The value axis is pinned to 0-100 rather than fitted to the data.
NintendoSales <- ggplot(
  NinTop10,
  aes(x = reorder(Name, Global_Sales), y = Global_Sales)
) +
  geom_col(fill = "royalblue3") +
  coord_flip() +
  labs(x = "Game Title", y = "Sales") +
  ylim(0, 100)
#############
# Xbox 360
#############
# Row positions of the ten largest Global_Sales values, largest first.
Xbox10 <- xbox[
  head(order(xbox$Global_Sales, decreasing = TRUE), 10),
]

# Horizontal bar chart of those ten titles, best seller at the top.
# The value axis is pinned to 0-100 rather than fitted to the data.
XboxSales <- ggplot(
  Xbox10,
  aes(x = reorder(Name, Global_Sales), y = Global_Sales)
) +
  geom_col(fill = "springgreen3") +
  coord_flip() +
  labs(x = "Game Title", y = "Sales") +
  ylim(0, 100)

# Put the Wii and Xbox 360 charts side by side under one shared title.
grid.arrange(
  NintendoSales,
  XboxSales,
  ncol = 2,
  top = "Top 10: Nintendo Wii and Xbox 360"
)
At first, this kind of surprised me, but then I realized that the Wii kind of broke into a new, more mainstream kind of gaming. What Nintendo was offering wasn’t just appealing to committed video game enthusiasts (quite the contrary, if I had to guess), but to the general masses. It was exciting new technology that the whole family could enjoy and play together, and it was more “active” than just sitting on the couch or in your chair.
Finally, I want to compare the sales between platforms over time.
# Total global sales per platform and release year, drawn as stacked bars.
# Only the three columns the chart needs are checked for missing values:
# na.omit() on the whole table also discards every game with a missing
# Critic_Score, User_Count, etc., which removes more than half of the rows
# from the totals even though those columns are not used here.
# NOTE(review): confirm the unit in the y-axis title; the sales columns may be
# unit counts rather than dollars.
vgsales %>%
  select(Platform, Year_of_Release, Global_Sales) %>%
  drop_na() %>%
  # Year_of_Release is text and marks unknown years as "N/A"; those rows have
  # no place on a time axis.
  filter(Year_of_Release != "N/A") %>%
  group_by(Platform, Year_of_Release) %>%
  summarize(total = sum(Global_Sales)) %>%
  ungroup() %>%
  ggplot(aes(x = Year_of_Release, y = total, fill = Platform)) +
  geom_col(position = "stack") +
  # Label every fifth year; breaks that do not occur in the data are ignored.
  scale_x_discrete(breaks = as.character(seq(1980, 2020, by = 5))) +
  scale_fill_viridis(discrete = TRUE) +
  theme(axis.text.x = element_text(size = 10, angle = 45)) +
  labs(
    y = "Sales in $Millions",
    x = "Year of Release",
    title = "Global Sales by Platform Over Time"
  )
I find this plot the most interesting of all because it is almost a nice bell curve. The distribution in it is quite even, with the exception of the spike in 2001 and then the ups and downs over the following 4 years. However, I have a hypothesis that those spikes are due to either a new and exciting platform or a big title coming back after a long hiatus. The dip in the more recent years could then be due to digital download sales.