library(readxl)
library(rvest)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
# Load the video game sales workbook into a tibble.
df <- read_excel("videogames.xlsx")

# Draw a 150-row random subsample. The seed is fixed so the same rows are
# selected on every run; slice_sample() replaces the superseded sample_n().
set.seed(123)
smalldf <- slice_sample(df, n = 150)

# Preview the first rows of the full data set.
head(df)
## # A tibble: 6 × 11
## Rank Name Platform Year Genre Publisher NA_Sales EU_Sales JP_Sales
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 Wii Sports Wii 2006 Sports Nintendo 41.5 29.0 3.77
## 2 2 Super Mario… NES 1985 Platfo… Nintendo 29.1 3.58 6.81
## 3 3 Mario Kart … Wii 2008 Racing Nintendo 15.8 12.9 3.79
## 4 4 Wii Sports … Wii 2009 Sports Nintendo 15.8 11.0 3.28
## 5 5 Pokemon Red… GB 1996 Role-P… Nintendo 11.3 8.89 10.2
## 6 6 Tetris GB 1989 Puzzle Nintendo 23.2 2.26 4.22
## # … with 2 more variables: Other_Sales <dbl>, Global_Sales <dbl>
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
# Convert every column of df into arules transactions and print the result.
transactions(x = df)
## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 not logical or factor.
## Applying default discretization (see '? discretizeDF').
## Warning in discretize(x = c(29.02, 3.58, 12.88, 11.01, 8.89, 2.26, 9.23, : The calculated breaks are: 0, 0, 0.06, 29.02
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.5, 2.93, : The calculated breaks are: 0, 0, 0.01, 10.22
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(8.46, 0.77, 3.31, 2.96, 1, 0.58, 2.9, 2.85, : The calculated breaks are: 0, 0, 0.02, 10.57
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## transactions in sparse format with
## 16598 transactions (rows) and
## 12170 items (columns)
I got lots of warnings, so let's check which columns they refer to.
# Show the names of columns 2, 4, 5, 6 and 7 of df.
names(df)[c(2, 4, 5, 6, 7)]
## [1] "Name" "Year" "Genre" "Publisher" "NA_Sales"
# Store the transactions built from df so the following steps can reuse them.
trans <- df %>%
  transactions()
## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 not logical or factor.
## Applying default discretization (see '? discretizeDF').
## Warning in discretize(x = c(29.02, 3.58, 12.88, 11.01, 8.89, 2.26, 9.23, : The calculated breaks are: 0, 0, 0.06, 29.02
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.5, 2.93, : The calculated breaks are: 0, 0, 0.01, 10.22
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(8.46, 0.77, 3.31, 2.96, 1, 0.58, 2.9, 2.85, : The calculated breaks are: 0, 0, 0.02, 10.57
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
# Print the summary of the transactions object.
trans %>%
  summary()
## transactions as itemMatrix in sparse format with
## 16598 rows (elements/itemsets/transactions) and
## 12170 columns (items) and a density of 0.000903862
##
## most frequent items:
## EU_Sales=[0,0.06) JP_Sales=[0,0.01) Other_Sales=[0,0.02)
## 10723 10455 9922
## Other_Sales=[0.02,10.6] JP_Sales=[0.01,10.2] (Other)
## 6676 6143 138659
##
## element (itemset/transaction) length distribution:
## sizes
## 11
## 16598
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11 11 11 11 11 11
##
## includes extended item information - examples:
## labels variables levels
## 1 Rank=[1,5.53e+03) Rank [1,5.53e+03)
## 2 Rank=[5.53e+03,1.11e+04) Rank [5.53e+03,1.11e+04)
## 3 Rank=[1.11e+04,1.66e+04] Rank [1.11e+04,1.66e+04]
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
# List the first six column (item) labels of the transactions object.
trans %>%
  colnames() %>%
  head(n = 6L)
## [1] "Rank=[1,5.53e+03)"
## [2] "Rank=[5.53e+03,1.11e+04)"
## [3] "Rank=[1.11e+04,1.66e+04]"
## [4] "Name=.hack: Sekai no Mukou ni + Versus"
## [5] "Name=.hack//G.U. Vol.1//Rebirth"
## [6] "Name=.hack//G.U. Vol.2//Reminisce"
# Display the first three transactions in full.
inspect(trans[seq_len(3L)])
## items transactionID
## [1] {Rank=[1,5.53e+03),
## Name=Wii Sports,
## Platform=Wii,
## Year=2006,
## Genre=Sports,
## Publisher=Nintendo,
## NA_Sales=[0.16,41.5],
## EU_Sales=[0.06,29],
## JP_Sales=[0.01,10.2],
## Other_Sales=[0.02,10.6],
## Global_Sales=[0.33,82.7]} 1
## [2] {Rank=[1,5.53e+03),
## Name=Super Mario Bros.,
## Platform=NES,
## Year=1985,
## Genre=Platform,
## Publisher=Nintendo,
## NA_Sales=[0.16,41.5],
## EU_Sales=[0.06,29],
## JP_Sales=[0.01,10.2],
## Other_Sales=[0.02,10.6],
## Global_Sales=[0.33,82.7]} 2
## [3] {Rank=[1,5.53e+03),
## Name=Mario Kart Wii,
## Platform=Wii,
## Year=2008,
## Genre=Racing,
## Publisher=Nintendo,
## NA_Sales=[0.16,41.5],
## EU_Sales=[0.06,29],
## JP_Sales=[0.01,10.2],
## Other_Sales=[0.02,10.6],
## Global_Sales=[0.33,82.7]} 3
# Plot the transactions object as an image.
image(x = trans)

# Plot item frequencies, limited to the 20 most frequent items.
itemFrequencyPlot(x = trans, topN = 20)
#
# {r}
# vertical <- as(trans, "tidLists")
# as(vertical, "matrix")[1:10, 1:5]
#