
library(ggplot2)        #plots
library(dplyr)          #data manipulation
library(gridExtra)      #arrange several ggplots on one page
library(caTools)        #sample.split for the train/test split
library(psych)          #dummy.code for one-hot encoding
library(neuralnet)      #feed-forward neural network
library(NeuralNetTools) #plotnet to visualise the fitted network
df<-read.csv('Video_Games_Sales_as_at_30_Nov_2016.csv',sep=',')
df$year = as.numeric(as.character(df$Year_of_Release))
df$User_Score_num = as.numeric(as.character(df$User_Score))

#remove rows with NA (the numeric coercions above turn non-numeric entries such as 'tbd' User_Scores into NA)
df <- na.omit(df)

#there are still a few rows with an empty Rating string; drop them and keep only releases from 1999 onwards
df<-filter(df,Rating!='' & year>=1999)

#rescale the User_Score (0-10) to the same 0-100 range as Critic_Score
df$User_Score_num = as.numeric(as.character(df$User_Score)) *10
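#optional check that both scores are now on the same 0-100 scale:
#summary(df[,c("User_Score_num","Critic_Score")])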

#create new feature, whether the console is portable or not
portable<-c('3DS','DS','GBA','PSP','PSV')
type<-function(x){
  if (x %in% portable) {return('PORTABLE')}
  else{return('HOME')}
}

df$Type<-sapply(df$Platform, type)
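#a vectorised alternative to sapply() that produces the same Type column:
#df$Type<-ifelse(df$Platform %in% portable, 'PORTABLE', 'HOME')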

#select columns and rename Platform to Machine because it clashes with the 'Platform' level of Genre
tmp <- select(df,NA_Sales, Critic_Score, Platform, Rating, Genre, Type)
colnames(tmp)[3]<-"Machine"

#create dummy variables
newRating<-dummy.code(tmp$Rating)
newMachine<-dummy.code(tmp$Machine)
newType<-dummy.code(tmp$Type)
newGenre<-dummy.code(tmp$Genre)
tmp2<-cbind(tmp,newRating,newMachine,newType,newGenre)
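#aside: base R's model.matrix() can build the same kind of 0/1 indicator columns per factor,
#e.g. for Rating (column names then carry a "Rating" prefix):
#newRating<-model.matrix(~ Rating - 1, data = tmp)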

#head(tmp2)

#remove the original columns used for the dummies (plus the V1 column) and rename some other columns
tmp2<-select(tmp2,-c(Machine,Rating,Genre,Type,V1))
colnames(tmp2)[5]<-"E10"
colnames(tmp2)[7]<-"KA"
colnames(tmp2)[38]<-"RolePlaying"
colnames(tmp2)[11]<-"ThreeDS"
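#an alternative sketch that avoids hard-coded positions: sanitise every column name in one
#pass with make.names() (note this yields slightly different names, e.g. "Role.Playing"):
#colnames(tmp2)<-make.names(colnames(tmp2))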

head(tmp2)
##   NA_Sales Critic_Score AO E E10 EC KA M RP T ThreeDS DC DS GBA GC N64 PC
## 1    41.36           76  0 1   0  0  0 0  0 0       0  0  0   0  0   0  0
## 2    15.67           82  0 1   0  0  0 0  0 0       0  0  0   0  0   0  0
## 3    15.61           80  0 1   0  0  0 0  0 0       0  0  0   0  0   0  0
## 4    11.28           89  0 1   0  0  0 0  0 0       0  0  1   0  0   0  0
## 5    13.96           58  0 1   0  0  0 0  0 0       0  0  0   0  0   0  0
## 6    14.42           87  0 1   0  0  0 0  0 0       0  0  0   0  0   0  0
##   PS PS2 PS3 PS4 PSP PSV Wii WiiU X360 XB XOne HOME PORTABLE Action
## 1  0   0   0   0   0   0   1    0    0  0    0    1        0      0
## 2  0   0   0   0   0   0   1    0    0  0    0    1        0      0
## 3  0   0   0   0   0   0   1    0    0  0    0    1        0      0
## 4  0   0   0   0   0   0   0    0    0  0    0    0        1      0
## 5  0   0   0   0   0   0   1    0    0  0    0    1        0      0
## 6  0   0   0   0   0   0   1    0    0  0    0    1        0      0
##   Adventure Fighting Misc Platform Puzzle Racing RolePlaying Shooter
## 1         0        0    0        0      0      0           0       0
## 2         0        0    0        0      0      1           0       0
## 3         0        0    0        0      0      0           0       0
## 4         0        0    0        1      0      0           0       0
## 5         0        0    1        0      0      0           0       0
## 6         0        0    0        1      0      0           0       0
##   Simulation Sports Strategy
## 1          0      1        0
## 2          0      0        0
## 3          0      1        0
## 4          0      0        0
## 5          0      0        0
## 6          0      0        0
#there are a few outliers that bias the NN, so for now (I know it is not a rigorous approach) I simply remove them
ggplot(data=tmp2,aes(x=NA_Sales)) + geom_histogram(bins=100)

tmp2<-filter(tmp2,NA_Sales<10)
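#a gentler alternative to dropping the outliers would be to cap (winsorise) NA_Sales,
#e.g. at its 99th percentile (sketch, not applied here):
#cap<-quantile(tmp2$NA_Sales, 0.99)
#tmp2$NA_Sales<-pmin(tmp2$NA_Sales, cap)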

#scale in [0,1] the numeric columns
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
tmp2$NA_Sales<-range01(tmp2$NA_Sales)
tmp2$Critic_Score<-range01(tmp2$Critic_Score)
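#note: the min/max used here come from the full dataset, so the test rows split off below
#also influence the scaling of the training rows; a stricter sketch would compute the range
#on the training rows only and reuse it for the test rows, e.g. with an illustrative helper:
#scale_with<-function(x, ref){(x - min(ref))/(max(ref) - min(ref))}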

#check the summary statistics --> all features are within [0,1]
#summary(tmp2)

#train/test samples
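#for a reproducible split (and reproducible random start weights in neuralnet below),
#seed the RNG first, e.g.:
#set.seed(101)  #seed value arbitrary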
split<-sample.split(tmp2$NA_Sales,SplitRatio=.7)
train<-subset(tmp2,split==T)
test<-subset(tmp2,split==F)

#build the formula for the NN from the column names

n <- names(train)
f <- as.formula(paste("NA_Sales ~", paste(n[!n %in% "NA_Sales"], collapse = " + ")))

#NN with 2 hidden layers (10 and 3 neurons)
nn <- neuralnet(f,data=train,hidden=c(10,3),threshold = 0.01,linear.output= TRUE,lifesign="minimal")
## hidden: 10, 3    thresh: 0.01    rep: 1/1    steps:   13122  error: 1.6586   time: 41.61 secs
#plot NN: NeuralNetTools can display the network, but it takes a very long time, so I will just paste the picture in the comment section
#plotnet(nn)

#evaluate the model on the test sample: metrics and plots
testing<-(test[,names(test) != "NA_Sales"])
pred<-neuralnet::compute(nn,testing)
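#note: these predictions live on the [0,1] scale; to report them in millions of units one
#would invert the min-max scaling, which requires saving min(NA_Sales) and max(NA_Sales)
#before range01 is applied, e.g. (sketch with illustrative names):
#pred_millions<-pred$net.result * (sales_max - sales_min) + sales_min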

#create a data frame with the predictions and the true values
results<-data.frame(Actual=test$NA_Sales, Prediction=pred$net.result)

corNN<-ggplot(data=results,aes(x=Actual,y=Prediction)) + geom_point() + geom_smooth() + geom_abline(intercept = 0, slope = 1, color="red") + xlim(0,1) + ylim(0,1)
diffNN<-ggplot(data=results,aes(x=Actual-Prediction)) + geom_histogram(bins=100) 
grid.arrange(corNN,diffNN,ncol=2)

#mean and median of the residuals (a measure of bias)
bias<-mean(results$Actual - results$Prediction)
med<-median(results$Actual - results$Prediction)

sprintf("Mean error: %f Median error: %f", bias, med)
## [1] "Mean error: 0.000424 Median error: -0.004462"
