This project is a continuation of Project Part 1, which is available at https://rpubs.com/S_ubin10/809024

All libraries and packages required for the project are loaded below:

library(readxl)
library(rvest)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
library(dplyr)

# Install rpart.plot only when it is not already available, so re-running
# the report does not reinstall the package every time.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
# Read the video game sales spreadsheet into `df`.
df <- read_excel(path = "videogames.xlsx")

The data set loaded above contains a list of video games with sales greater than 100,000 copies. Each row in the data set describes one video game.

# Preview the first rows of the data set.
df %>% head()
## # A tibble: 6 × 11
##    Rank Name         Platform Year  Genre   Publisher NA_Sales EU_Sales JP_Sales
##   <dbl> <chr>        <chr>    <chr> <chr>   <chr>        <dbl>    <dbl>    <dbl>
## 1     1 Wii Sports   Wii      2006  Sports  Nintendo      41.5    29.0      3.77
## 2     2 Super Mario… NES      1985  Platfo… Nintendo      29.1     3.58     6.81
## 3     3 Mario Kart … Wii      2008  Racing  Nintendo      15.8    12.9      3.79
## 4     4 Wii Sports … Wii      2009  Sports  Nintendo      15.8    11.0      3.28
## 5     5 Pokemon Red… GB       1996  Role-P… Nintendo      11.3     8.89    10.2 
## 6     6 Tetris       GB       1989  Puzzle  Nintendo      23.2     2.26     4.22
## # … with 2 more variables: Other_Sales <dbl>, Global_Sales <dbl>

Here we take a random sample of 150 observations from the 16,598 observations. For the decision tree, we use one categorical variable (Platform) as the response and two quantitative variables (JP_Sales and NA_Sales) as the predictors.

library(rpart)
library(rpart.plot)

# Fix the random seed so the sample, and therefore the fitted tree, is the
# same every time the report is run.
set.seed(2021)

# Draw 150 rows at random from the data set.
smalldf <- slice_sample(df, n = 150)

# Fit a classification tree that predicts Platform from the Japanese and
# North American sales figures of the sampled games.
tree <- rpart(
  Platform ~ JP_Sales + NA_Sales,
  data = smalldf,
  method = "class"
)

# Print the text representation of the fitted tree.
tree
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 150 126 PS2 (0.02 0.033 0.13 0.0067 0.06 0.027 0.0067 0.053 0.093 0.16 0.087 0.033 0.067 0.013 0.0067 0.0067 0.08 0.06 0.047 0.0067)  
##    2) NA_Sales< 0.115 84  68 DS (0 0.06 0.19 0 0.024 0.024 0 0.095 0.11 0.11 0.048 0.024 0.12 0.024 0.012 0 0.071 0.024 0.071 0)  
##      4) JP_Sales>=0.115 8   3 PS (0 0 0.12 0 0 0 0 0 0.62 0.13 0 0 0.12 0 0 0 0 0 0 0) *
##      5) JP_Sales< 0.115 76  61 DS (0 0.066 0.2 0 0.026 0.026 0 0.11 0.053 0.11 0.053 0.026 0.12 0.026 0.013 0 0.079 0.026 0.079 0)  
##       10) JP_Sales>=0.005 28  21 DS (0 0.18 0.25 0 0 0 0 0 0 0.071 0.036 0.036 0.25 0.071 0.036 0 0 0.036 0.036 0)  
##         20) JP_Sales>=0.035 13   8 DS (0 0.23 0.38 0 0 0 0 0 0 0.077 0 0 0.23 0 0.077 0 0 0 0 0) *
##         21) JP_Sales< 0.035 15  11 PSP (0 0.13 0.13 0 0 0 0 0 0 0.067 0.067 0.067 0.27 0.13 0 0 0 0.067 0.067 0) *
##       11) JP_Sales< 0.005 48  40 DS (0 0 0.17 0 0.042 0.042 0 0.17 0.083 0.12 0.062 0.021 0.042 0 0 0 0.12 0.021 0.1 0)  
##         22) NA_Sales< 0.005 9   3 PC (0 0 0.22 0 0 0 0 0.67 0 0 0.11 0 0 0 0 0 0 0 0 0) *
##         23) NA_Sales>=0.005 39  33 DS (0 0 0.15 0 0.051 0.051 0 0.051 0.1 0.15 0.051 0.026 0.051 0 0 0 0.15 0.026 0.13 0)  
##           46) NA_Sales>=0.085 11   6 DS (0 0 0.45 0 0 0.091 0 0 0.091 0.18 0 0 0 0 0 0 0.091 0 0.091 0) *
##           47) NA_Sales< 0.085 28  23 Wii (0 0 0.036 0 0.071 0.036 0 0.071 0.11 0.14 0.071 0.036 0.071 0 0 0 0.18 0.036 0.14 0) *
##    3) NA_Sales>=0.115 66  51 PS2 (0.045 0 0.061 0.015 0.11 0.03 0.015 0 0.076 0.23 0.14 0.045 0 0 0 0.015 0.091 0.11 0.015 0.015)  
##      6) NA_Sales>=0.255 46  39 PS3 (0.043 0 0.065 0.022 0.13 0.043 0.022 0 0.087 0.11 0.15 0.065 0 0 0 0.022 0.087 0.13 0.022 0)  
##       12) NA_Sales< 0.55 30  23 PS3 (0.067 0 0.067 0 0.17 0 0 0 0.13 0.067 0.23 0.033 0 0 0 0 0.1 0.1 0.033 0)  
##         24) NA_Sales>=0.325 21  14 PS3 (0.095 0 0.048 0 0.19 0 0 0 0.095 0.048 0.33 0 0 0 0 0 0.14 0 0.048 0) *
##         25) NA_Sales< 0.325 9   6 X360 (0 0 0.11 0 0.11 0 0 0 0.22 0.11 0 0.11 0 0 0 0 0 0.33 0 0) *
##       13) NA_Sales>=0.55 16  13 PS2 (0 0 0.062 0.063 0.062 0.13 0.063 0 0 0.19 0 0.12 0 0 0 0.063 0.063 0.19 0 0) *
##      7) NA_Sales< 0.255 20  10 PS2 (0.05 0 0.05 0 0.05 0 0 0 0.05 0.5 0.1 0 0 0 0 0 0.1 0.05 0 0.05) *

Creating a visualization of the decision tree.

# Draw the fitted tree; `extra = 2` selects the extra per-node information
# shown in each box.
rpart.plot(x = tree, extra = 2)
## Warning: All boxes will be white (the box.palette argument will be ignored) because
## the number of classes in the response 20 is greater than length(box.palette) 6.
## To silence this warning use box.palette=0 or trace=-1.

Making predictions for the full data set so that they can be compared with the actual values.

# Predict a Platform class for every row of the full data set.
pred <- predict(object = tree, newdata = df, type = "class")

# Show the first few predicted classes.
pred %>% head()
##   1   2   3   4   5   6 
## PS2 PS2 PS2 PS2 PS2 PS2 
## 20 Levels: 2600 3DS DS GB GBA GC NES PC PS PS2 PS3 PS4 PSP PSV SAT SNES ... XOne

Probabilities of Classification

# Without `type`, predict() returns the per-class probabilities for each row;
# show the first few rows for the sampled data.
head(predict(object = tree, newdata = smalldf))
##        2600       3DS         DS GB       GBA GC NES PC        PS        PS2
## 1 0.0952381 0.0000000 0.04761905  0 0.1904762  0   0  0 0.0952381 0.04761905
## 2 0.0000000 0.2307692 0.38461538  0 0.0000000  0   0  0 0.0000000 0.07692308
## 3 0.0000000 0.0000000 0.12500000  0 0.0000000  0   0  0 0.6250000 0.12500000
## 4 0.0000000 0.0000000 0.12500000  0 0.0000000  0   0  0 0.6250000 0.12500000
## 5 0.0000000 0.0000000 0.11111111  0 0.1111111  0   0  0 0.2222222 0.11111111
## 6 0.0500000 0.0000000 0.05000000  0 0.0500000  0   0  0 0.0500000 0.50000000
##         PS3       PS4       PSP PSV        SAT SNES       Wii      X360
## 1 0.3333333 0.0000000 0.0000000   0 0.00000000    0 0.1428571 0.0000000
## 2 0.0000000 0.0000000 0.2307692   0 0.07692308    0 0.0000000 0.0000000
## 3 0.0000000 0.0000000 0.1250000   0 0.00000000    0 0.0000000 0.0000000
## 4 0.0000000 0.0000000 0.1250000   0 0.00000000    0 0.0000000 0.0000000
## 5 0.0000000 0.1111111 0.0000000   0 0.00000000    0 0.0000000 0.3333333
## 6 0.1000000 0.0000000 0.0000000   0 0.00000000    0 0.1000000 0.0500000
##           XB XOne
## 1 0.04761905 0.00
## 2 0.00000000 0.00
## 3 0.00000000 0.00
## 4 0.00000000 0.00
## 5 0.00000000 0.00
## 6 0.00000000 0.05

When inspecting a classification model’s performance, a confusion table tells you the distribution of the predictions and targets. Making the confusion table:

# Cross-tabulate each game's Genre against its predicted Platform.
# NOTE(review): a confusion matrix for this classifier would normally compare
# the actual Platform with `pred`; Genre is not the variable being predicted.
confusion_table <- table(Genre = df$Genre, pred = pred)
confusion_table
##               pred
## Genre          2600  3DS   DS   GB  GBA   GC  NES   PC   PS  PS2  PS3  PS4  PSP
##   Action          0    0  482    0    0    0    0  168  180 1079  295    0  272
##   Adventure       0    0  263    0    0    0    0   85   72  156   40    0  401
##   Fighting        0    0  159    0    0    0    0    5  116  237   76    0   77
##   Misc            0    0  288    0    0    0    0  120  131  505  126    0  154
##   Platform        0    0   89    0    0    0    0   24   38  348   99    0   17
##   Puzzle          0    0   70    0    0    0    0   58   42  121   30    0   16
##   Racing          0    0  134    0    0    0    0   83   28  391  113    0   39
##   Role-Playing    0    0  299    0    0    0    0   61  300  353   73    0  171
##   Shooter         0    0  110    0    0    0    0   77   40  475  131    0   48
##   Simulation      0    0  101    0    0    0    0   87   72  214   82    0   66
##   Sports          0    0  292    0    0    0    0  143  217  785  246    0   66
##   Strategy        0    0  101    0    0    0    0  130  111   90   20    0   58
##               pred
## Genre           PSV  SAT SNES  Wii X360   XB XOne
##   Action          0    0    0  669  171    0    0
##   Adventure       0    0    0  246   23    0    0
##   Fighting        0    0    0  131   47    0    0
##   Misc            0    0    0  346   69    0    0
##   Platform        0    0    0  227   44    0    0
##   Puzzle          0    0    0  220   25    0    0
##   Racing          0    0    0  406   55    0    0
##   Role-Playing    0    0    0  182   49    0    0
##   Shooter         0    0    0  360   69    0    0
##   Simulation      0    0    0  200   45    0    0
##   Sports          0    0    0  479  118    0    0
##   Strategy        0    0    0  161   10    0    0

We used Genre as the row variable of the table, so it shows how the predicted platforms are distributed across genres.

Cross Validation

# Draw 115 distinct row indices from across the whole data set, so the
# training rows are not limited to the top of the file.
s <- sample(nrow(df), 115)

# Rows selected by `s` form the training set.
train_data <- df[s, ]

# Every remaining row is held out for testing.
test_data <- df[-s, ]

Now we use the training data to build the model and then test it.

# Rows and columns in the training set.
train_data %>% dim()
## [1] 115  11
# Rows and columns in the test set.
test_data %>% dim()
## [1] 16483    11

Now creating a full tree for the training data.

# Fit a classification tree on the training rows, predicting Platform from
# Japanese sales only.
dtm <- rpart(
  formula = Platform ~ JP_Sales,
  data = train_data,
  method = "class"
)

# Draw the fitted tree.
rpart.plot(x = dtm, extra = 2)
## Warning: All boxes will be white (the box.palette argument will be ignored) because
## the number of classes in the response 18 is greater than length(box.palette) 6.
## To silence this warning use box.palette=0 or trace=-1.

In this part we create a chi-squared table to examine the importance of each feature.

  library(FSelector)

  # Score every other column against Platform with the chi-squared filter,
  # then list the features from most to least important.
  weights <- arrange(
    as_tibble(chi.squared(Platform ~ ., data = df), rownames = "feature"),
    desc(attr_importance)
  )
  weights
## # A tibble: 10 × 2
##    feature      attr_importance
##    <chr>                  <dbl>
##  1 Name                   0.872
##  2 Year                   0.387
##  3 Publisher              0.385
##  4 JP_Sales               0.330
##  5 EU_Sales               0.292
##  6 NA_Sales               0.271
##  7 Other_Sales            0.244
##  8 Rank                   0.234
##  9 Global_Sales           0.233
## 10 Genre                  0.180

According to the table, Genre has an importance of 0.1799156, which is the lowest among all the features, and Name has an importance of 0.8722909, which is the highest.

# Horizontal bar chart of the importance scores, with features ordered by
# their score.
ggplot(
  data = weights,
  mapping = aes(
    x = attr_importance,
    y = reorder(feature, attr_importance)
  )
) +
  geom_col() +
  labs(x = "Importance score", y = "Feature")

To make the decision tree, I only used three variables because compiling the tree using all the data was taking forever, but I used all the variables to make the table above. So, comparing the decision tree and the chi-squared statistics accurately is difficult.