Outlines :
library(ggplot2)
library(dplyr)
library(plotly)
library(gridExtra)
library(fmsb)
library(corrplot)
library(corrgram)
library(GGally)
library(caTools)
library(psych)
library(neuralnet)
df<-read.csv('Video_Games_Sales_as_at_30_Nov_2016.csv',sep=',')
#use head for a quick look at the columns
head(df)
## Name Platform Year_of_Release Genre Publisher
## 1 Wii Sports Wii 2006 Sports Nintendo
## 2 Mario Kart Wii Wii 2008 Racing Nintendo
## 3 Wii Sports Resort Wii 2009 Sports Nintendo
## 4 New Super Mario Bros. DS 2006 Platform Nintendo
## 5 Wii Play Wii 2006 Misc Nintendo
## 6 New Super Mario Bros. Wii Wii 2009 Platform Nintendo
## NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score
## 1 41.36 28.95 3.77 8.45 82.53 76
## 2 15.67 12.75 3.79 3.28 35.50 82
## 3 15.61 10.92 3.28 2.95 32.76 80
## 4 11.28 9.14 6.50 2.88 29.79 89
## 5 13.96 9.18 2.93 2.84 28.92 58
## 6 14.42 6.94 4.70 2.24 28.31 87
## Critic_Count User_Score User_Count Developer Rating
## 1 51 8 321 Nintendo E
## 2 73 8.3 709 Nintendo E
## 3 73 8 192 Nintendo E
## 4 65 8.5 431 Nintendo E
## 5 41 6.6 129 Nintendo E
## 6 80 8.4 593 Nintendo E
#use str to know the type of each colums
str(df)
## 'data.frame': 15849 obs. of 16 variables:
## $ Name : Factor w/ 10845 levels " Fire Emblem Fates",..: 10371 5202 10373 6275 10369 6278 6428 5201 10363 10364 ...
## $ Platform : Factor w/ 18 levels "3DS","DC","DS",..: 14 14 14 3 14 14 3 3 14 14 ...
## $ Year_of_Release: Factor w/ 29 levels "1985","1988",..: 16 18 19 16 16 19 15 15 17 19 ...
## $ Genre : Factor w/ 12 levels "Action","Adventure",..: 11 7 11 5 4 5 10 7 11 11 ...
## $ Publisher : Factor w/ 527 levels "10TACLE Studios",..: 338 338 338 338 338 338 338 338 338 338 ...
## $ NA_Sales : num 41.4 15.7 15.6 11.3 14 ...
## $ EU_Sales : num 28.95 12.75 10.92 9.14 9.18 ...
## $ JP_Sales : num 3.77 3.79 3.28 6.5 2.93 4.7 1.93 4.13 3.6 2.53 ...
## $ Other_Sales : num 8.45 3.28 2.95 2.88 2.84 2.24 2.74 1.9 2.15 1.77 ...
## $ Global_Sales : num 82.5 35.5 32.8 29.8 28.9 ...
## $ Critic_Score : int 76 82 80 89 58 87 NA 91 80 80 ...
## $ Critic_Count : int 51 73 73 65 41 80 NA 64 63 33 ...
## $ User_Score : Factor w/ 93 levels "","0.2","0.3",..: 76 79 76 81 62 80 1 82 73 70 ...
## $ User_Count : int 321 709 192 431 129 593 NA 464 146 52 ...
## $ Developer : Factor w/ 1517 levels "","1C, 1C Company",..: 934 934 934 934 934 934 1 934 934 934 ...
## $ Rating : Factor w/ 9 levels "","AO","E","E10+",..: 3 3 3 3 3 3 1 3 3 3 ...
#use summary to get the statistic of each colum
summary(df)
## Name Platform Year_of_Release
## Need for Speed: Most Wanted: 12 PS2 :2161 2008 :1427
## FIFA 14 : 9 DS :2150 2009 :1426
## LEGO Marvel Super Heroes : 9 PS3 :1329 2010 :1255
## Madden NFL 07 : 9 Wii :1319 2007 :1197
## Ratatouille : 9 X360 :1260 2011 :1135
## Angry Birds Star Wars : 8 PSP :1209 2006 :1005
## (Other) :15793 (Other):6421 (Other):8404
## Genre Publisher
## Action :3234 Electronic Arts : 1354
## Sports :2239 Activision : 953
## Misc :1695 Ubisoft : 923
## Role-Playing:1389 Namco Bandai Games : 880
## Shooter :1256 Konami Digital Entertainment: 789
## Adventure :1253 THQ : 714
## (Other) :4783 (Other) :10236
## NA_Sales EU_Sales JP_Sales Other_Sales
## Min. : 0.0000 Min. : 0.0000 Min. :0.00000 Min. : 0.00000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 0.00000
## Median : 0.0800 Median : 0.0200 Median :0.00000 Median : 0.01000
## Mean : 0.2505 Mean : 0.1455 Mean :0.06023 Mean : 0.04843
## 3rd Qu.: 0.2300 3rd Qu.: 0.1100 3rd Qu.:0.03000 3rd Qu.: 0.04000
## Max. :41.3600 Max. :28.9500 Max. :6.50000 Max. :10.57000
##
## Global_Sales Critic_Score Critic_Count User_Score
## Min. : 0.0100 Min. :13.00 Min. : 3.00 :7878
## 1st Qu.: 0.0600 1st Qu.:61.00 1st Qu.: 12.00 tbd :1784
## Median : 0.1600 Median :72.00 Median : 22.00 7.8 : 257
## Mean : 0.5049 Mean :69.97 Mean : 27.44 8 : 250
## 3rd Qu.: 0.4500 3rd Qu.:80.00 3rd Qu.: 38.00 8.2 : 240
## Max. :82.5300 Max. :99.00 Max. :112.00 8.3 : 220
## NA's :9268 NA's :9268 (Other):5220
## User_Count Developer Rating
## Min. : 4.0 :7814 :7919
## 1st Qu.: 11.0 Ubisoft : 155 E :3190
## Median : 28.0 EA Sports: 153 T :2346
## Mean : 180.8 EA Canada: 139 M :1277
## 3rd Qu.: 94.0 Konami : 119 E10+ :1105
## Max. :10592.0 Capcom : 118 EC : 6
## NA's :9662 (Other) :7351 (Other): 6
Comments :
Year_of_Release
, User_Score
columns into numericCritic_Score
, Critic_Count
and User_Count
; that will have to be taken into account when looking at the ScoresSales
column that I may omit for visualization purpose :filter(df,Global_Sales>80)
## Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales
## 1 Wii Sports Wii 2006 Sports Nintendo 41.36 28.95
## JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score
## 1 3.77 8.45 82.53 76 51 8
## User_Count Developer Rating
## 1 321 Nintendo E
#change factor into numeric
df$year = as.numeric(as.character(df$Year_of_Release))
df$User_Score_num = as.numeric(as.character(df$User_Score))
#create new columns to regroup the Platform by manufacturers
sony<-c('PS','PS2','PS3','PS4' ,'PSP','PSV')
microsoft<-c('PC','X360','XB','XOne')
nintendo<-c('3DS','DS','GBA','GC','N64','Wii','WiiU')
sega<-c('DC')
newPlatform<-function(x){
if (x %in% sony == TRUE) {return('SONY')}
else if(x %in% microsoft == TRUE) {return('MICROSOFT')}
else if(x %in% nintendo == TRUE) {return('NINTENDO')}
else if(x %in% sega == TRUE) {return('SEGA')}
else{return('OTHER')}
}
df$newPlatform<-sapply(df$Platform, newPlatform)
#ultimately create a new dataframe cleanup of all NA's ...
df2 <- na.omit(df)
#there are still few rows for which the Rating is an empty string
df2<-filter(df2,Rating!='')
#df2<-df[complete.cases(df),]
g0<-ggplot(df,aes(x=reorder(Rating,Rating,function(x)-length(x)))) + geom_bar(aes(fill=newPlatform),color='black') + theme(axis.text.x = element_text(angle=90, hjust=1)) + xlab('Genre') + theme(legend.position=c(.8, .65))
g1<-ggplot(df2,aes(x=reorder(Rating,Rating,function(x)-length(x)))) + geom_bar(aes(fill=newPlatform),color='black') + theme(axis.text.x = element_text(angle=90, hjust=1)) + xlab('Genre') + theme(legend.position=c(.8, .65))
grid.arrange(g0,g1,ncol=2)
Comments :
E
and E10+
)#NA's not removed, outlier not removed
ggplot(data=df,aes(x=factor(year),y=Global_Sales)) + geom_boxplot(aes(fill=factor(newPlatform))) + theme(legend.position=c(.8, .65)) + theme(axis.text.x = element_text(angle=90, hjust=1))
#NA's not removed, outlier removed, y-axis rescaled
ggplot(data=df,aes(x=factor(year),y=Global_Sales)) + geom_boxplot(aes(fill=factor(newPlatform)),outlier.shape=NA) + ylim(0,6) + theme(legend.position=c(.8, .65)) + theme(axis.text.x = element_text(angle=90, hjust=1))
#NA's removed, outlier removed, y-axis rescaled
ggplot(data=df2,aes(x=factor(year),y=Global_Sales)) + geom_boxplot(aes(fill=factor(newPlatform)),outlier.shape=NA)+ ylim(0,6) + theme(legend.position=c(.8, .65)) + theme(axis.text.x = element_text(angle=90, hjust=1))
Comments :
Metacritic was launched in 1999 so technically a user score, metacritic score have less meaning when looking at the scores.
#there are 31 entries for years before 1996
#filter(df,year==1985 | year==1988 | year==1992 | year==1994)
#years < 1996 removed
ggplot(data=filter(df2,year>=1996),aes(x=factor(year),y=Global_Sales)) + geom_boxplot(aes(fill=factor(newPlatform)),outlier.shape=NA) + theme(legend.position=c(.8, .65)) + ylim(0,6) + theme(axis.text.x = element_text(angle=90, hjust=1))
#Nyears before metacritic's creation removed
ggplot(data=filter(df2,year>=1999),aes(x=factor(year),y=Global_Sales)) + geom_boxplot(aes(fill=factor(newPlatform)),outlier.shape=NA) + ylim(0,4) + theme(legend.position=c(.8, .65)) + theme(axis.text.x = element_text(angle=90, hjust=1))
As in this section I’m only interested in the number of sales, I can use the full dataframe (NA’s not removed, except for years < 1999 and years > 2016)
SALES<-filter(df,year>=1999 & year<=2016)
#group selected columns by platform, according the sum of sales
SALES_res<-SALES %>% group_by(newPlatform) %>% select(NA_Sales, EU_Sales, JP_Sales, Other_Sales,Global_Sales) %>% summarise('total NA' = sum(NA_Sales),'total EU' = sum(EU_Sales),'total JP' = sum(JP_Sales),'total Other' = sum(Other_Sales), 'total Global' = sum(Global_Sales))
#for the radar plots, define the min/max of each axis
max<-c(1400,1000,500,500)
min<-rep(0,4)
temp<-rbind(max,min,SALES_res[1:4,2:5])
#define colors
colors_border=c( rgb(0.0,0.0,0.0,0.9), rgb(0.,0.,1,0.9) , rgb(0.,1.0,0.0,0.9) , rgb(1.0,0.0,0.0,0.9))
colors_in=c( rgb(0.0,0.0,0.0,0.3), rgb(0.,0.,1,0.3) , rgb(0.,1.0,0.0,0.3) , rgb(1.0,0.0,0.0,0.3))
#plot + legend
radarchart( temp , axistype=2 , pcol=colors_border , pfcol=colors_in , plwd=4 , plty=1,cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,2000,5), cglwd=0.8,vlcex=0.8 )
legend(x=0.6, y=1., legend = c("MICROSOFT","NINTENDO","SEGA","SONY"), bty = "n", pch=10 , col=colors_in , text.col = "grey", cex=.5, pt.cex=.5)
print(as.data.frame(SALES_res))
## newPlatform total NA total EU total JP total Other total Global
## 1 MICROSOFT 932.26 497.14 14.10 127.13 1572.46
## 2 NINTENDO 1359.51 657.57 438.93 169.66 2628.35
## 3 SEGA 4.17 1.08 7.13 0.19 12.58
## 4 SONY 1316.30 959.89 377.62 433.27 3086.88
For this section I will consider the data cleaned up of NA’s. Also I noticed that the User_Score
goes from 0 to 10 whereas the metacritic score Critic_Score
goes from 0 to 10 so I will rescale the User_Score
#select years for which Metacritic was on
df3<-filter(df2,year>=1999)
#rescale the User_Score
df3$User_Score_num = as.numeric(as.character(df3$User_Score)) *10
#quick plot
ggplot(data=df3,aes(x=User_Score_num,y=Critic_Score)) + geom_point(aes(color=factor(newPlatform), shape=factor(Rating)),size=2,alpha=.5) + geom_smooth(method = "lm", size=.5,color="black", formula = y ~ x)
#breakdown per Rating
ggplot(data=df3,aes(x=User_Score_num,y=Critic_Score)) + geom_point(aes(color=factor(newPlatform)),size=2,alpha=.5) + geom_smooth(method = "lm", size=.5,color="black", formula = y ~ x) + facet_wrap(~Rating)
Comments :
Scores
but it is not really linear.Predicting the Metacritic score would be difficult given the data present (and not sure if it would make sense), however we can try to predict (regression) the sales as a function of some parameters. It will remain a very naive model because the success of a game depends on other factors (Studio, franchise), nevertheless as seen by the next plots, there seems to have a correlation between the Sales
and the Metacritic score
.
First I define a new variable to know the type of the consoles :
portable<-c('3DS','DS','GBA','PSP','PSV')
type<-function(x){
if (x %in% portable == TRUE) {return('PORTABLE')}
else{return('HOME')}
}
df3$Type<-sapply(df3$Platform, type)
sonyUs<-ggplot(data=filter(df3,newPlatform=='SONY'),aes(x=Critic_Score,y=NA_Sales)) + geom_point(aes(color=Genre,shape=Type),size=3,alpha=.5) + ylim(0,10) + geom_smooth()
micUs<-ggplot(data=filter(df3,newPlatform=='MICROSOFT'),aes(x=Critic_Score,y=NA_Sales)) + geom_point(aes(color=Genre,shape=Type),size=3,alpha=.5) + ylim(0,10)+ geom_smooth()
ninUs<-ggplot(data=filter(df3,newPlatform=='NINTENDO'),aes(x=Critic_Score,y=NA_Sales)) + geom_point(aes(color=Genre,shape=Type),size=3,alpha=.5) + ylim(0,10)+ geom_smooth()
grid.arrange(sonyUs,ncol=1)
grid.arrange(micUs,ncol=1)
grid.arrange(ninUs,ncol=1)
We can see that a higher metacritic score, apparently independently of the genre and type of console, leads to higher sales. It’s somehow expected since players look at reviews before buying a new game.
Before making a model, we need to check if there are more corrleation (linear) between the numeric variables.
num.cols <- sapply(df3, is.numeric)
cor.data <- cor(df3[,num.cols])
corrPLOT<-corrplot(cor.data,method='ellipse')
GGally
packageI found this package in replacement of plotmatrix
, quite useful to makw a combo-plot (plot is the correlation between the NA_Sales for SONY, breakdown by the Type
of consoles and the 2 Scores)
temp<-(df3 %>% filter(newPlatform=='SONY') %>% select(NA_Sales,Critic_Score, User_Score_num, Type))
ggpairs(temp, mapping = aes(color = Type))
I define a function to plot the summary of a model(MSE, RMSE, R^2, plots) to make it faster to study some models :
plotRes<-function(mod){
print(mod)
summary(mod)
#create DF with prediction and real values
mod.predictions <- predict(mod,test)
mod.res<- cbind(mod.predictions,test$NA_Sales)
colnames(mod.res) <- c('pred','real')
mod.res <- as.data.frame(mod.res)
#make plots of residuals,etc...
g1<-ggplot(data=mod.res,aes(x=pred,y=real)) + geom_point()
g2<-ggplot(data=mod.res,aes(x=real-pred)) + geom_histogram(bins=50)
g3<-ggplot(data=mod.res,aes(x=pred,y=real-pred)) + geom_point()
grid.arrange(g1,g2,g3,nrow=2, ncol=2)
#calculate metrics
mse <- mean((mod.res$real-mod.res$pred)^2)
rmse<-mse^0.5
SSE = sum((mod.res$pred - mod.res$real)^2)
SST = sum( (mean(test$NA_Sales) - mod.res$real)^2)
R2 = 1 - SSE/SST
sprintf("MSE: %f RMSE : %f R2 :%f", mse,rmse,R2)
}
To test the model, I split the data into training/testing samples :
set.seed(101)
split<-sample.split(df3$NA_Sales,SplitRatio=.7)
train<-subset(df3,split==T)
test<-subset(df3,split==F)
Given the shape of NA_sales
vs. Critic_Score
, I will try a polynomial fit (degree 3). Note I tried a linear model including almost all features :
linModel<-lm(NA_Sales ~ newPlatform + Type + Critic_Score + Rating, train)
,but the metrics were very bad (I need to go back to this later)
model<-lm(NA_Sales ~ Critic_Score + I(Critic_Score^2) + I(Critic_Score^3),train)
plotRes(model)
##
## Call:
## lm(formula = NA_Sales ~ Critic_Score + I(Critic_Score^2) + I(Critic_Score^3),
## data = train)
##
## Coefficients:
## (Intercept) Critic_Score I(Critic_Score^2)
## -3.702e+00 2.391e-01 -4.615e-03
## I(Critic_Score^3)
## 2.865e-05
## [1] "MSE: 0.247124 RMSE : 0.497116 R2 :0.098404"
#look at the data and model included (test sample)
testFunc <- function(x) {model$coefficients[1] + x*model$coefficients[2] + x*x*model$coefficients[3] + x*x*x*model$coefficients[4]}
ggplot(data=test,aes(x=Critic_Score,y=NA_Sales)) + geom_point() + stat_function(fun=testFunc,color='red',size=1) + xlab("MetaCritic Score") + ylab('NA Sales')
Well … geom_smooth
achieves an even better result :-(
Using a NN requires some work on the data, meaning converting the Categorical into numeric variable. For thid, I made dummy variables using the Psych package. Also as a first try (and because training a NN takes time) I restrained only to the Type and newPlatform variables
tmp <- select(df3,NA_Sales, Critic_Score, newPlatform, Type)
newPlat<-dummy.code(tmp$newPlatform)
newType<-dummy.code(tmp$Type)
tmp2<-cbind(tmp,newPlat,newType)
head(tmp2)
## NA_Sales Critic_Score newPlatform Type MICROSOFT NINTENDO SEGA SONY
## 1 41.36 76 NINTENDO HOME 0 1 0 0
## 2 15.67 82 NINTENDO HOME 0 1 0 0
## 3 15.61 80 NINTENDO HOME 0 1 0 0
## 4 11.28 89 NINTENDO PORTABLE 0 1 0 0
## 5 13.96 58 NINTENDO HOME 0 1 0 0
## 6 14.42 87 NINTENDO HOME 0 1 0 0
## HOME PORTABLE
## 1 1 0
## 2 1 0
## 3 1 0
## 4 0 1
## 5 1 0
## 6 1 0
tmp2<-select(tmp2,NA_Sales,Critic_Score,MICROSOFT,NINTENDO,SEGA,SONY,HOME,PORTABLE)
split<-sample.split(tmp2$NA_Sales,SplitRatio=.7)
train2<-subset(tmp2,split==T)
test2<-subset(tmp2,split==F)
n <- names(train2)
f <- as.formula(paste("NA_Sales ~", paste(n[!n %in% "NA_Sales"], collapse = " + ")))
print(f)
## NA_Sales ~ Critic_Score + MICROSOFT + NINTENDO + SEGA + SONY +
## HOME + PORTABLE
nn <- neuralnet(f,data=train2,hidden=3,threshold = 1,linear.output= FALSE,lifesign="minimal")
## hidden: 3 thresh: 1 rep: 1/1 steps: 2426 error: 2619.02923 time: 2.18 secs
summary(nn)
## Length Class Mode
## call 7 -none- call
## response 3893 -none- numeric
## covariate 27251 -none- numeric
## model.list 2 -none- list
## err.fct 1 -none- function
## act.fct 1 -none- function
## linear.output 1 -none- logical
## data 8 data.frame list
## net.result 1 -none- list
## weights 1 -none- list
## startweights 1 -none- list
## generalized.weights 1 -none- list
## result.matrix 31 -none- numeric
plot(nn)
testing<-subset(test2,select=c("Critic_Score","MICROSOFT", "NINTENDO", "SEGA", "SONY", "HOME", "PORTABLE"))
pred<-neuralnet::compute(nn,testing)
#the net will output a normalized prediction, so we need to scale it back in order to make a meaningful comparison
predScale <- pred$net.result*(max(test2$NA_Sales)-min(test2$NA_Sales))+min(test2$NA_Sales)
results<-data.frame(Actual=test2$NA_Sales, Prediction=pred$net.result)
results<-cbind(results,predScale)
corrNN<-ggplot(data=results,aes(x=Actual,y=predScale)) + geom_point()
diffNN<-ggplot(data=results,aes(x=Actual-predScale)) + geom_histogram(bins=100)
grid.arrange(corrNN,diffNN,ncol=2)
History :
Comments