Outlines :

library(ggplot2)
library(dplyr)
library(plotly)
library(gridExtra)
library(fmsb)
library(corrplot)
library(corrgram)
library(GGally)
library(caTools)
library(psych)
library(neuralnet)

Data exploration

df<-read.csv('Video_Games_Sales_as_at_30_Nov_2016.csv',sep=',')
#use head for a quick look at the columns
head(df)
##                        Name Platform Year_of_Release    Genre Publisher
## 1                Wii Sports      Wii            2006   Sports  Nintendo
## 2            Mario Kart Wii      Wii            2008   Racing  Nintendo
## 3         Wii Sports Resort      Wii            2009   Sports  Nintendo
## 4     New Super Mario Bros.       DS            2006 Platform  Nintendo
## 5                  Wii Play      Wii            2006     Misc  Nintendo
## 6 New Super Mario Bros. Wii      Wii            2009 Platform  Nintendo
##   NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score
## 1    41.36    28.95     3.77        8.45        82.53           76
## 2    15.67    12.75     3.79        3.28        35.50           82
## 3    15.61    10.92     3.28        2.95        32.76           80
## 4    11.28     9.14     6.50        2.88        29.79           89
## 5    13.96     9.18     2.93        2.84        28.92           58
## 6    14.42     6.94     4.70        2.24        28.31           87
##   Critic_Count User_Score User_Count Developer Rating
## 1           51          8        321  Nintendo      E
## 2           73        8.3        709  Nintendo      E
## 3           73          8        192  Nintendo      E
## 4           65        8.5        431  Nintendo      E
## 5           41        6.6        129  Nintendo      E
## 6           80        8.4        593  Nintendo      E
#use str to know the type of each colums
str(df)
## 'data.frame':    15849 obs. of  16 variables:
##  $ Name           : Factor w/ 10845 levels " Fire Emblem Fates",..: 10371 5202 10373 6275 10369 6278 6428 5201 10363 10364 ...
##  $ Platform       : Factor w/ 18 levels "3DS","DC","DS",..: 14 14 14 3 14 14 3 3 14 14 ...
##  $ Year_of_Release: Factor w/ 29 levels "1985","1988",..: 16 18 19 16 16 19 15 15 17 19 ...
##  $ Genre          : Factor w/ 12 levels "Action","Adventure",..: 11 7 11 5 4 5 10 7 11 11 ...
##  $ Publisher      : Factor w/ 527 levels "10TACLE Studios",..: 338 338 338 338 338 338 338 338 338 338 ...
##  $ NA_Sales       : num  41.4 15.7 15.6 11.3 14 ...
##  $ EU_Sales       : num  28.95 12.75 10.92 9.14 9.18 ...
##  $ JP_Sales       : num  3.77 3.79 3.28 6.5 2.93 4.7 1.93 4.13 3.6 2.53 ...
##  $ Other_Sales    : num  8.45 3.28 2.95 2.88 2.84 2.24 2.74 1.9 2.15 1.77 ...
##  $ Global_Sales   : num  82.5 35.5 32.8 29.8 28.9 ...
##  $ Critic_Score   : int  76 82 80 89 58 87 NA 91 80 80 ...
##  $ Critic_Count   : int  51 73 73 65 41 80 NA 64 63 33 ...
##  $ User_Score     : Factor w/ 93 levels "","0.2","0.3",..: 76 79 76 81 62 80 1 82 73 70 ...
##  $ User_Count     : int  321 709 192 431 129 593 NA 464 146 52 ...
##  $ Developer      : Factor w/ 1517 levels "","1C, 1C Company",..: 934 934 934 934 934 934 1 934 934 934 ...
##  $ Rating         : Factor w/ 9 levels "","AO","E","E10+",..: 3 3 3 3 3 3 1 3 3 3 ...
#use summary to get the statistic of each colum
summary(df)
##                           Name          Platform    Year_of_Release
##  Need for Speed: Most Wanted:   12   PS2    :2161   2008   :1427   
##  FIFA 14                    :    9   DS     :2150   2009   :1426   
##  LEGO Marvel Super Heroes   :    9   PS3    :1329   2010   :1255   
##  Madden NFL 07              :    9   Wii    :1319   2007   :1197   
##  Ratatouille                :    9   X360   :1260   2011   :1135   
##  Angry Birds Star Wars      :    8   PSP    :1209   2006   :1005   
##  (Other)                    :15793   (Other):6421   (Other):8404   
##           Genre                             Publisher    
##  Action      :3234   Electronic Arts             : 1354  
##  Sports      :2239   Activision                  :  953  
##  Misc        :1695   Ubisoft                     :  923  
##  Role-Playing:1389   Namco Bandai Games          :  880  
##  Shooter     :1256   Konami Digital Entertainment:  789  
##  Adventure   :1253   THQ                         :  714  
##  (Other)     :4783   (Other)                     :10236  
##     NA_Sales          EU_Sales          JP_Sales        Other_Sales      
##  Min.   : 0.0000   Min.   : 0.0000   Min.   :0.00000   Min.   : 0.00000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.: 0.00000  
##  Median : 0.0800   Median : 0.0200   Median :0.00000   Median : 0.01000  
##  Mean   : 0.2505   Mean   : 0.1455   Mean   :0.06023   Mean   : 0.04843  
##  3rd Qu.: 0.2300   3rd Qu.: 0.1100   3rd Qu.:0.03000   3rd Qu.: 0.04000  
##  Max.   :41.3600   Max.   :28.9500   Max.   :6.50000   Max.   :10.57000  
##                                                                          
##   Global_Sales      Critic_Score    Critic_Count      User_Score  
##  Min.   : 0.0100   Min.   :13.00   Min.   :  3.00          :7878  
##  1st Qu.: 0.0600   1st Qu.:61.00   1st Qu.: 12.00   tbd    :1784  
##  Median : 0.1600   Median :72.00   Median : 22.00   7.8    : 257  
##  Mean   : 0.5049   Mean   :69.97   Mean   : 27.44   8      : 250  
##  3rd Qu.: 0.4500   3rd Qu.:80.00   3rd Qu.: 38.00   8.2    : 240  
##  Max.   :82.5300   Max.   :99.00   Max.   :112.00   8.3    : 220  
##                    NA's   :9268    NA's   :9268     (Other):5220  
##    User_Count          Developer        Rating    
##  Min.   :    4.0            :7814          :7919  
##  1st Qu.:   11.0   Ubisoft  : 155   E      :3190  
##  Median :   28.0   EA Sports: 153   T      :2346  
##  Mean   :  180.8   EA Canada: 139   M      :1277  
##  3rd Qu.:   94.0   Konami   : 119   E10+   :1105  
##  Max.   :10592.0   Capcom   : 118   EC     :   6  
##  NA's   :9662      (Other)  :7351   (Other):   6

Comments :

filter(df,Global_Sales>80)
##         Name Platform Year_of_Release  Genre Publisher NA_Sales EU_Sales
## 1 Wii Sports      Wii            2006 Sports  Nintendo    41.36    28.95
##   JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score
## 1     3.77        8.45        82.53           76           51          8
##   User_Count Developer Rating
## 1        321  Nintendo      E

Data preparation

#change factor into numeric
df$year = as.numeric(as.character(df$Year_of_Release))
df$User_Score_num = as.numeric(as.character(df$User_Score))

#create new columns to regroup the Platform by manufacturers
sony<-c('PS','PS2','PS3','PS4' ,'PSP','PSV')
microsoft<-c('PC','X360','XB','XOne')
nintendo<-c('3DS','DS','GBA','GC','N64','Wii','WiiU')
sega<-c('DC')

newPlatform<-function(x){
    if (x %in% sony == TRUE) {return('SONY')}
    else if(x %in% microsoft == TRUE) {return('MICROSOFT')}
    else if(x %in% nintendo == TRUE) {return('NINTENDO')}
    else if(x %in% sega == TRUE) {return('SEGA')}
    else{return('OTHER')}
}
df$newPlatform<-sapply(df$Platform, newPlatform)

#ultimately create a new dataframe cleanup of all NA's ...
df2 <- na.omit(df)
#there are still few rows for which the Rating is an empty string
df2<-filter(df2,Rating!='')
#df2<-df[complete.cases(df),]

Global Plots

effect of NA’s on the data

g0<-ggplot(df,aes(x=reorder(Rating,Rating,function(x)-length(x)))) + geom_bar(aes(fill=newPlatform),color='black') + theme(axis.text.x = element_text(angle=90, hjust=1)) + xlab('Genre') + theme(legend.position=c(.8, .65))
g1<-ggplot(df2,aes(x=reorder(Rating,Rating,function(x)-length(x)))) + geom_bar(aes(fill=newPlatform),color='black') + theme(axis.text.x = element_text(angle=90, hjust=1)) + xlab('Genre') + theme(legend.position=c(.8, .65))
grid.arrange(g0,g1,ncol=2)

Comments :

  • The main column of the data with NA’s has no label. This column also contribute for ~ half of the data.
  • nintendo is indeed prioritizing games for children (E and E10+)

effect of the outlier on the data’s visualization

#NA's not removed, outlier not removed
ggplot(data=df,aes(x=factor(year),y=Global_Sales)) + geom_boxplot(aes(fill=factor(newPlatform))) + theme(legend.position=c(.8, .65)) + theme(axis.text.x = element_text(angle=90, hjust=1))

#NA's not removed, outlier removed, y-axis rescaled
ggplot(data=df,aes(x=factor(year),y=Global_Sales)) + geom_boxplot(aes(fill=factor(newPlatform)),outlier.shape=NA) + ylim(0,6) + theme(legend.position=c(.8, .65)) +  theme(axis.text.x = element_text(angle=90, hjust=1))

#NA's removed, outlier removed, y-axis rescaled
ggplot(data=df2,aes(x=factor(year),y=Global_Sales)) + geom_boxplot(aes(fill=factor(newPlatform)),outlier.shape=NA)+ ylim(0,6) + theme(legend.position=c(.8, .65)) +  theme(axis.text.x = element_text(angle=90, hjust=1))

Comments :

  • from a visualization point of view, the last plot (NA’s removed, no outlier) is the best one but it’s a bit of cheating

Metacritic was launched in 1999 so technically a user score, metacritic score have less meaning when looking at the scores.

#there are 31 entries for years before 1996
#filter(df,year==1985 | year==1988 | year==1992 | year==1994)
#years < 1996 removed
ggplot(data=filter(df2,year>=1996),aes(x=factor(year),y=Global_Sales)) + geom_boxplot(aes(fill=factor(newPlatform)),outlier.shape=NA) + theme(legend.position=c(.8, .65)) + ylim(0,6) + theme(axis.text.x = element_text(angle=90, hjust=1))

#Nyears before metacritic's creation removed
ggplot(data=filter(df2,year>=1999),aes(x=factor(year),y=Global_Sales)) + geom_boxplot(aes(fill=factor(newPlatform)),outlier.shape=NA) + ylim(0,4) + theme(legend.position=c(.8, .65)) +  theme(axis.text.x = element_text(angle=90, hjust=1))

Radar Plots

As in this section I’m only interested in the number of sales, I can use the full dataframe (NA’s not removed, except for years < 1999 and years > 2016)

SALES<-filter(df,year>=1999 & year<=2016)
#group selected columns by platform, according the sum of sales
SALES_res<-SALES %>% group_by(newPlatform) %>% select(NA_Sales, EU_Sales, JP_Sales, Other_Sales,Global_Sales) %>% summarise('total NA' = sum(NA_Sales),'total EU' = sum(EU_Sales),'total JP' = sum(JP_Sales),'total Other' = sum(Other_Sales), 'total Global' = sum(Global_Sales))
#for the radar plots, define the min/max of each axis
max<-c(1400,1000,500,500)
min<-rep(0,4)
temp<-rbind(max,min,SALES_res[1:4,2:5])
#define colors
colors_border=c( rgb(0.0,0.0,0.0,0.9), rgb(0.,0.,1,0.9) , rgb(0.,1.0,0.0,0.9) , rgb(1.0,0.0,0.0,0.9))
colors_in=c( rgb(0.0,0.0,0.0,0.3), rgb(0.,0.,1,0.3) , rgb(0.,1.0,0.0,0.3) , rgb(1.0,0.0,0.0,0.3))
#plot + legend
radarchart( temp , axistype=2 , pcol=colors_border , pfcol=colors_in , plwd=4 , plty=1,cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,2000,5), cglwd=0.8,vlcex=0.8 )
legend(x=0.6, y=1., legend = c("MICROSOFT","NINTENDO","SEGA","SONY"), bty = "n", pch=10 , col=colors_in , text.col = "grey", cex=.5, pt.cex=.5)

Comments

  • SONY has a very strong sale representation throughout the world
  • MICROSOFT, as expected, shows a very low number of sales in ASIA (JP) ; XBOX360/1 are apparently not the preferred machine in Japan
  • NINTENDO has its best sale number with the Japan market

Display the values

print(as.data.frame(SALES_res))
##   newPlatform total NA total EU total JP total Other total Global
## 1   MICROSOFT   932.26   497.14    14.10      127.13      1572.46
## 2    NINTENDO  1359.51   657.57   438.93      169.66      2628.35
## 3        SEGA     4.17     1.08     7.13        0.19        12.58
## 4        SONY  1316.30   959.89   377.62      433.27      3086.88

Modelling : Scores / sales

For this section I will consider the data cleaned up of NA’s. Also I noticed that the User_Score goes from 0 to 10 whereas the metacritic score Critic_Score goes from 0 to 10 so I will rescale the User_Score

#select years for which Metacritic was on
df3<-filter(df2,year>=1999)
#rescale the User_Score
df3$User_Score_num = as.numeric(as.character(df3$User_Score)) *10
#quick plot
ggplot(data=df3,aes(x=User_Score_num,y=Critic_Score)) + geom_point(aes(color=factor(newPlatform), shape=factor(Rating)),size=2,alpha=.5) + geom_smooth(method = "lm", size=.5,color="black", formula = y ~ x) 

#breakdown per Rating
ggplot(data=df3,aes(x=User_Score_num,y=Critic_Score)) + geom_point(aes(color=factor(newPlatform)),size=2,alpha=.5) + geom_smooth(method = "lm", size=.5,color="black", formula = y ~ x) + facet_wrap(~Rating)

Comments :

Predicting the Metacritic score would be difficult given the data present (and not sure if it would make sense), however we can try to predict (regression) the sales as a function of some parameters. It will remain a very naive model because the success of a game depends on other factors (Studio, franchise), nevertheless as seen by the next plots, there seems to have a correlation between the Sales and the Metacritic score.

First I define a new variable to know the type of the consoles :

portable<-c('3DS','DS','GBA','PSP','PSV')
type<-function(x){
 if (x %in% portable == TRUE) {return('PORTABLE')}
  else{return('HOME')}
}
df3$Type<-sapply(df3$Platform, type)
sonyUs<-ggplot(data=filter(df3,newPlatform=='SONY'),aes(x=Critic_Score,y=NA_Sales)) + geom_point(aes(color=Genre,shape=Type),size=3,alpha=.5) + ylim(0,10) + geom_smooth()
micUs<-ggplot(data=filter(df3,newPlatform=='MICROSOFT'),aes(x=Critic_Score,y=NA_Sales)) + geom_point(aes(color=Genre,shape=Type),size=3,alpha=.5) + ylim(0,10)+ geom_smooth()
ninUs<-ggplot(data=filter(df3,newPlatform=='NINTENDO'),aes(x=Critic_Score,y=NA_Sales)) + geom_point(aes(color=Genre,shape=Type),size=3,alpha=.5) + ylim(0,10)+ geom_smooth()

Sales in NA vs. Metacritic scores, breakdown by Genre and Type of consoles

grid.arrange(sonyUs,ncol=1)

grid.arrange(micUs,ncol=1)

grid.arrange(ninUs,ncol=1)

We can see that a higher metacritic score, apparently independently of the genre and type of console, leads to higher sales. It’s somehow expected since players look at reviews before buying a new game.

Correlation Matrix

Before making a model, we need to check if there are more corrleation (linear) between the numeric variables.

num.cols <- sapply(df3, is.numeric)
cor.data <- cor(df3[,num.cols])
corrPLOT<-corrplot(cor.data,method='ellipse')

GGally package

I found this package in replacement of plotmatrix, quite useful to makw a combo-plot (plot is the correlation between the NA_Sales for SONY, breakdown by the Type of consoles and the 2 Scores)

temp<-(df3 %>% filter(newPlatform=='SONY') %>% select(NA_Sales,Critic_Score, User_Score_num, Type))
ggpairs(temp, mapping = aes(color = Type))

Polynomial regression (1st test)

I define a function to plot the summary of a model(MSE, RMSE, R^2, plots) to make it faster to study some models :

plotRes<-function(mod){
   print(mod)
   summary(mod)
   #create DF with prediction and real values
     mod.predictions <- predict(mod,test)
     mod.res<- cbind(mod.predictions,test$NA_Sales)
     colnames(mod.res) <- c('pred','real')
     mod.res <- as.data.frame(mod.res)
     #make plots of residuals,etc...
     g1<-ggplot(data=mod.res,aes(x=pred,y=real)) + geom_point()
     g2<-ggplot(data=mod.res,aes(x=real-pred)) + geom_histogram(bins=50)
     g3<-ggplot(data=mod.res,aes(x=pred,y=real-pred)) + geom_point()
     grid.arrange(g1,g2,g3,nrow=2, ncol=2)
     #calculate metrics
     mse <- mean((mod.res$real-mod.res$pred)^2)
     rmse<-mse^0.5
     SSE = sum((mod.res$pred - mod.res$real)^2)
     SST = sum( (mean(test$NA_Sales) - mod.res$real)^2)
     R2 = 1 - SSE/SST
     sprintf("MSE: %f RMSE : %f R2 :%f", mse,rmse,R2)
}

To test the model, I split the data into training/testing samples :

set.seed(101)
split<-sample.split(df3$NA_Sales,SplitRatio=.7)
train<-subset(df3,split==T)
test<-subset(df3,split==F)

Given the shape of NA_sales vs. Critic_Score, I will try a polynomial fit (degree 3). Note I tried a linear model including almost all features :

linModel<-lm(NA_Sales ~  newPlatform + Type + Critic_Score + Rating, train)

,but the metrics were very bad (I need to go back to this later)

model<-lm(NA_Sales ~ Critic_Score + I(Critic_Score^2) + I(Critic_Score^3),train)
plotRes(model)
## 
## Call:
## lm(formula = NA_Sales ~ Critic_Score + I(Critic_Score^2) + I(Critic_Score^3), 
##     data = train)
## 
## Coefficients:
##       (Intercept)       Critic_Score  I(Critic_Score^2)  
##        -3.702e+00          2.391e-01         -4.615e-03  
## I(Critic_Score^3)  
##         2.865e-05

## [1] "MSE: 0.247124 RMSE : 0.497116 R2 :0.098404"
#look at the data and model included (test sample)
testFunc <- function(x) {model$coefficients[1] + x*model$coefficients[2] + x*x*model$coefficients[3] + x*x*x*model$coefficients[4]}
ggplot(data=test,aes(x=Critic_Score,y=NA_Sales)) + geom_point() + stat_function(fun=testFunc,color='red',size=1) + xlab("MetaCritic Score") + ylab('NA Sales')

Well … geom_smooth achieves an even better result :-(

Neural Net

Using a NN requires some work on the data, meaning converting the Categorical into numeric variable. For thid, I made dummy variables using the Psych package. Also as a first try (and because training a NN takes time) I restrained only to the Type and newPlatform variables

tmp <- select(df3,NA_Sales, Critic_Score, newPlatform, Type)
newPlat<-dummy.code(tmp$newPlatform)
newType<-dummy.code(tmp$Type)
tmp2<-cbind(tmp,newPlat,newType)
head(tmp2)
##   NA_Sales Critic_Score newPlatform     Type MICROSOFT NINTENDO SEGA SONY
## 1    41.36           76    NINTENDO     HOME         0        1    0    0
## 2    15.67           82    NINTENDO     HOME         0        1    0    0
## 3    15.61           80    NINTENDO     HOME         0        1    0    0
## 4    11.28           89    NINTENDO PORTABLE         0        1    0    0
## 5    13.96           58    NINTENDO     HOME         0        1    0    0
## 6    14.42           87    NINTENDO     HOME         0        1    0    0
##   HOME PORTABLE
## 1    1        0
## 2    1        0
## 3    1        0
## 4    0        1
## 5    1        0
## 6    1        0
tmp2<-select(tmp2,NA_Sales,Critic_Score,MICROSOFT,NINTENDO,SEGA,SONY,HOME,PORTABLE)
split<-sample.split(tmp2$NA_Sales,SplitRatio=.7)
train2<-subset(tmp2,split==T)
test2<-subset(tmp2,split==F)
n <- names(train2)
f <- as.formula(paste("NA_Sales ~", paste(n[!n %in% "NA_Sales"], collapse = " + ")))
print(f)
## NA_Sales ~ Critic_Score + MICROSOFT + NINTENDO + SEGA + SONY + 
##     HOME + PORTABLE
nn <- neuralnet(f,data=train2,hidden=3,threshold = 1,linear.output= FALSE,lifesign="minimal")
## hidden: 3    thresh: 1    rep: 1/1    steps:    2426 error: 2619.02923   time: 2.18 secs
summary(nn)
##                     Length Class      Mode    
## call                    7  -none-     call    
## response             3893  -none-     numeric 
## covariate           27251  -none-     numeric 
## model.list              2  -none-     list    
## err.fct                 1  -none-     function
## act.fct                 1  -none-     function
## linear.output           1  -none-     logical 
## data                    8  data.frame list    
## net.result              1  -none-     list    
## weights                 1  -none-     list    
## startweights            1  -none-     list    
## generalized.weights     1  -none-     list    
## result.matrix          31  -none-     numeric
plot(nn)

testing<-subset(test2,select=c("Critic_Score","MICROSOFT", "NINTENDO", "SEGA", "SONY", "HOME", "PORTABLE"))
pred<-neuralnet::compute(nn,testing)
#the net will output a normalized prediction, so we need to scale it back in order to make a meaningful comparison 
predScale <- pred$net.result*(max(test2$NA_Sales)-min(test2$NA_Sales))+min(test2$NA_Sales)

results<-data.frame(Actual=test2$NA_Sales, Prediction=pred$net.result)
results<-cbind(results,predScale)

corrNN<-ggplot(data=results,aes(x=Actual,y=predScale)) + geom_point()
diffNN<-ggplot(data=results,aes(x=Actual-predScale)) + geom_histogram(bins=100)

grid.arrange(corrNN,diffNN,ncol=2)

History :

  • version 1 : initial commit : data overview, cleanup
  • version 2 : added a radar plot for the sales per platform
  • version 3 : added outlines, Sales vs. metacritic scores, correlation matrix