DecisionTree

library(readxl)
library(rvest)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()

# Load the video game sales workbook; the path is resolved against the
# current working directory.
df <- read_excel("videogames.xlsx")

# Draw a 150-row random subset of df.
# NOTE(review): no seed is set, so smalldf differs on every run; call
# set.seed() beforehand if the sample has to be reproducible.
smalldf <- slice_sample(df, n = 150)

# Preview the first rows of the full data set.
head(df)

## # A tibble: 6 × 11
##    Rank Name         Platform Year  Genre   Publisher NA_Sales EU_Sales JP_Sales
##   <dbl> <chr>        <chr>    <chr> <chr>   <chr>        <dbl>    <dbl>    <dbl>
## 1     1 Wii Sports   Wii      2006  Sports  Nintendo      41.5    29.0      3.77
## 2     2 Super Mario… NES      1985  Platfo… Nintendo      29.1     3.58     6.81
## 3     3 Mario Kart … Wii      2008  Racing  Nintendo      15.8    12.9      3.79
## 4     4 Wii Sports … Wii      2009  Sports  Nintendo      15.8    11.0      3.28
## 5     5 Pokemon Red… GB       1996  Role-P… Nintendo      11.3     8.89    10.2 
## 6     6 Tetris       GB       1989  Puzzle  Nintendo      23.2     2.26     4.22
## # … with 2 more variables: Other_Sales <dbl>, Global_Sales <dbl>

library(arules)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'arules'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library(arulesViz)

# Coerce the whole data frame to an arules transactions object and show it.
# Columns that are not logical/factor go through arules' default
# discretization, which is what triggers the warnings recorded below.
print(transactions(df))

## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 not logical or factor.
## Applying default discretization (see '? discretizeDF').

## Warning in discretize(x = c(29.02, 3.58, 12.88, 11.01, 8.89, 2.26, 9.23, : The calculated breaks are: 0, 0, 0.06, 29.02
##   Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.

## Warning in discretize(x = c(3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.5, 2.93, : The calculated breaks are: 0, 0, 0.01, 10.22
##   Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.

## Warning in discretize(x = c(8.46, 0.77, 3.31, 2.96, 1, 0.58, 2.9, 2.85, : The calculated breaks are: 0, 0, 0.02, 10.57
##   Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.

## transactions in sparse format with
##  16598 transactions (rows) and
##  12170 items (columns)

The conversion produced a lot of warnings (not errors), so let's check which columns they refer to.

# Look up the names of columns 2, 4, 5, 6 and 7 of df.
# NOTE(review): the first warning lists columns 1-11, and the duplicate-break
# warnings come from EU_Sales, JP_Sales and Other_Sales (columns 8-10);
# these indices match neither set - confirm which columns were intended.
names(df)[c(2, 4:7)]

## [1] "Name"      "Year"      "Genre"     "Publisher" "NA_Sales"

# Keep the transactions version of df for the inspection steps that follow.
# This repeats the conversion shown earlier, so the same warnings are raised.
trans <- df %>% transactions()

## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 not logical or factor.
## Applying default discretization (see '? discretizeDF').

## Warning in discretize(x = c(29.02, 3.58, 12.88, 11.01, 8.89, 2.26, 9.23, : The calculated breaks are: 0, 0, 0.06, 29.02
##   Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.

## Warning in discretize(x = c(3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.5, 2.93, : The calculated breaks are: 0, 0, 0.01, 10.22
##   Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.

## Warning in discretize(x = c(8.46, 0.77, 3.31, 2.96, 1, 0.58, 2.9, 2.85, : The calculated breaks are: 0, 0, 0.02, 10.57
##   Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.

# Overview of the transaction set: dimensions, density, most frequent items
# and the distribution of transaction lengths.
trans %>% summary()

## transactions as itemMatrix in sparse format with
##  16598 rows (elements/itemsets/transactions) and
##  12170 columns (items) and a density of 0.000903862 
## 
## most frequent items:
##       EU_Sales=[0,0.06)       JP_Sales=[0,0.01)    Other_Sales=[0,0.02) 
##                   10723                   10455                    9922 
## Other_Sales=[0.02,10.6]    JP_Sales=[0.01,10.2]                 (Other) 
##                    6676                    6143                  138659 
## 
## element (itemset/transaction) length distribution:
## sizes
##    11 
## 16598 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      11      11      11      11      11      11 
## 
## includes extended item information - examples:
##                     labels variables              levels
## 1        Rank=[1,5.53e+03)      Rank        [1,5.53e+03)
## 2 Rank=[5.53e+03,1.11e+04)      Rank [5.53e+03,1.11e+04)
## 3 Rank=[1.11e+04,1.66e+04]      Rank [1.11e+04,1.66e+04]
## 
## includes extended transaction information - examples:
##   transactionID
## 1             1
## 2             2
## 3             3

# First six item labels (the column names of the transactions object).
trans %>%
  colnames() %>%
  head(n = 6L)

## [1] "Rank=[1,5.53e+03)"                     
## [2] "Rank=[5.53e+03,1.11e+04)"              
## [3] "Rank=[1.11e+04,1.66e+04]"              
## [4] "Name=.hack: Sekai no Mukou ni + Versus"
## [5] "Name=.hack//G.U. Vol.1//Rebirth"       
## [6] "Name=.hack//G.U. Vol.2//Reminisce"

# Print the items of the first three transactions in readable form.
inspect(trans[seq_len(3L)])

##     items                      transactionID
## [1] {Rank=[1,5.53e+03),                     
##      Name=Wii Sports,                       
##      Platform=Wii,                          
##      Year=2006,                             
##      Genre=Sports,                          
##      Publisher=Nintendo,                    
##      NA_Sales=[0.16,41.5],                  
##      EU_Sales=[0.06,29],                    
##      JP_Sales=[0.01,10.2],                  
##      Other_Sales=[0.02,10.6],               
##      Global_Sales=[0.33,82.7]}             1
## [2] {Rank=[1,5.53e+03),                     
##      Name=Super Mario Bros.,                
##      Platform=NES,                          
##      Year=1985,                             
##      Genre=Platform,                        
##      Publisher=Nintendo,                    
##      NA_Sales=[0.16,41.5],                  
##      EU_Sales=[0.06,29],                    
##      JP_Sales=[0.01,10.2],                  
##      Other_Sales=[0.02,10.6],               
##      Global_Sales=[0.33,82.7]}             2
## [3] {Rank=[1,5.53e+03),                     
##      Name=Mario Kart Wii,                   
##      Platform=Wii,                          
##      Year=2008,                             
##      Genre=Racing,                          
##      Publisher=Nintendo,                    
##      NA_Sales=[0.16,41.5],                  
##      EU_Sales=[0.06,29],                    
##      JP_Sales=[0.01,10.2],                  
##      Other_Sales=[0.02,10.6],               
##      Global_Sales=[0.33,82.7]}             3

# Plot the transactions object as an image.
# NOTE(review): trans is 16598 x 12170 per the summary above, so this plot is
# likely slow and hard to read - consider plotting a subset instead.
image(trans)

# Bar chart of the 20 most frequent items.
itemFrequencyPlot(trans, topN = 20L)

# {r}
# vertical <- as(trans, "tidLists")
# as(vertical, "matrix")[1:10, 1:5]
#