Outlines :
library(ggplot2)
library(dplyr)
library(gridExtra)
library(caTools)
library(psych)
library(neuralnet)
library(NeuralNetTools)
df<-read.csv('Video_Games_Sales_as_at_30_Nov_2016.csv',sep=',')
df$year = as.numeric(as.character(df$Year_of_Release))
df$User_Score_num = as.numeric(as.character(df$User_Score))
#remove NA
df <- na.omit(df)
#there are still few rows for which the Rating is an empty string and further cleanup
df<-filter(df,Rating!='' & year>=1999)
#rescale the User_Score
df$User_Score_num = as.numeric(as.character(df$User_Score)) *10
#create new feature, whether the console is portable or not
portable<-c('3DS','DS','GBA','PSP','PSV')
type<-function(x){
if (x %in% portable == TRUE) {return('PORTABLE')}
else{return('HOME')}
}
df$Type<-sapply(df$Platform, type)
#select columns and rename Platform because of conflict with Platform fromgGenre
tmp <- select(df,NA_Sales, Critic_Score, Platform, Rating, Genre, Type)
colnames(tmp)[3]<-"Machine"
#create dummy variables
newRating<-dummy.code(tmp$Rating)
newMachine<-dummy.code(tmp$Machine)
newType<-dummy.code(tmp$Type)
newGenre<-dummy.code(tmp$Genre)
tmp2<-cbind(tmp,newRating,newMachine,newType,newGenre)
#head(tmp2)
#remove columns used for dummies and rename some other columns
tmp2<-(select(tmp2,-c(Machine:Rating:Genre:Type:V1)))
colnames(tmp2)[5]<-"E10"
colnames(tmp2)[7]<-"KA"
colnames(tmp2)[38]<-"RolePlaying"
colnames(tmp2)[11]<-"ThreeDS"
head(tmp2)
## NA_Sales Critic_Score AO E E10 EC KA M RP T ThreeDS DC DS GBA GC N64 PC
## 1 41.36 76 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 15.67 82 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 15.61 80 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 11.28 89 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
## 5 13.96 58 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 14.42 87 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## PS PS2 PS3 PS4 PSP PSV Wii WiiU X360 XB XOne HOME PORTABLE Action
## 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0
## 2 0 0 0 0 0 0 1 0 0 0 0 1 0 0
## 3 0 0 0 0 0 0 1 0 0 0 0 1 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 5 0 0 0 0 0 0 1 0 0 0 0 1 0 0
## 6 0 0 0 0 0 0 1 0 0 0 0 1 0 0
## Adventure Fighting Misc Platform Puzzle Racing RolePlaying Shooter
## 1 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 1 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 1 0 0 0 0
## 5 0 0 1 0 0 0 0 0
## 6 0 0 0 1 0 0 0 0
## Simulation Sports Strategy
## 1 0 1 0
## 2 0 0 0
## 3 0 1 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
#there is few outliers that bias the NN, so for now (I know it's not a good method) I simply remove them
ggplot(data=tmp2,aes(x=NA_Sales)) + geom_histogram(bins=100)
tmp2<-filter(tmp2,NA_Sales<10)
#scale in [0,1] the numeric columns
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
tmp2$NA_Sales<-range01(tmp2$NA_Sales)
tmp2$Critic_Score<-range01(tmp2$Critic_Score)
#check the statistic --> all features within [0,1]
#summary(tmp2)
#train/test samples
split<-sample.split(tmp2$NA_Sales,SplitRatio=.7)
train<-subset(tmp2,split==T)
test<-subset(tmp2,split==F)
#create formulae for NN
n <- names(train)
f <- as.formula(paste("NA_Sales ~", paste(n[!n %in% "NA_Sales"], collapse = " + ")))
#NN with 2 hidden layers
nn <- neuralnet(f,data=train,hidden=c(10,3),threshold = 0.01,linear.output= TRUE,lifesign="minimal")
## hidden: 10, 3 thresh: 0.01 rep: 1/1 steps: 13122 error: 1.6586 time: 41.61 secs
#plot NN : NeuralNetTools allows to display the NN but it takes a very long time so I will just paste the picture in the comment section
#plotnet(nn)
#evaluate the accuracy of the model with the test sample, metrics, plots
testing<-(test[,names(test) != "NA_Sales"])
pred<-neuralnet::compute(nn,testing)
#create a datafframe with prediction/true values
results<-data.frame(Actual=test$NA_Sales, Prediction=pred$net.result)
corNN<-ggplot(data=results,aes(x=Actual,y=Prediction)) + geom_point() + geom_smooth() + geom_abline(intercept = 0, slope = 1, color="red") + xlim(0,1) + ylim(0,1)
diffNN<-ggplot(data=results,aes(x=Actual-Prediction)) + geom_histogram(bins=100)
grid.arrange(corNN,diffNN,ncol=2)
mse<-mean(results$Actual - results$Prediction)
med<-median(results$Actual - results$Prediction)
sprintf("MSE: %f MEDIAN :%f", mse,med)
## [1] "MSE: 0.000424 MEDIAN :-0.004462"
History :