ndata <- readRDS("C:/Users/df568923/Desktop/stat/data/ndata.rds")
par_year1234 <- readRDS("C:/Users/df568923/Desktop/stat/data/par_year1234.rds")
Outline
- Questions
- Data
- Result: Analysis and Findings
- Conclusion and Discussion
Questions
- What kind of Broadgames tends to be popular?
- Relation between parameters of broadgame and the rating it gets.
- Using Machine Learning to predict the rating score.
load library
library(ggplot2)
library(readr)
library(dplyr)
library(stringr)
library(gridExtra)
library(ggrepel)
str(ndata)
'data.frame': 976 obs. of 26 variables:
$ id : chr "36235" "197405" "19427" "215613" ...
$ type : chr "boardgame" "boardgame" "boardgame" "boardgame" ...
$ name : chr "The Duke" "Tak" "Gemblo" "Tao Long: The Way of the Dragon" ...
$ yearpublished : int 2013 2017 2003 2017 2016 2017 2012 2015 2013 2019 ...
$ minplayers : int 2 2 2 2 1 2 2 2 2 2 ...
$ maxplayers : int 2 2 6 2 4 4 7 2 2 4 ...
$ playingtime : int 30 60 30 30 30 60 45 45 15 30 ...
$ minplaytime : int 30 20 30 10 30 15 45 30 15 15 ...
$ maxplaytime : int 30 60 30 30 30 60 45 45 15 30 ...
$ minage : int 13 12 6 14 13 12 8 12 0 14 ...
$ users_rated : int 3526 1708 1335 1040 927 534 743 1705 1730 271 ...
$ average_rating : num 7.44 7.73 6.83 7.03 6.8 ...
$ bayes_average_rating: num 6.87 6.63 6.14 6.12 5.98 ...
$ total_owners : int 5864 3329 1478 3239 1948 1252 1218 3607 4785 512 ...
$ total_traders : int 188 41 41 144 168 21 74 191 362 11 ...
$ total_wanters : int 310 199 148 64 45 44 45 88 19 25 ...
$ total_wishers : int 1498 1089 588 492 235 165 191 650 255 105 ...
$ total_comments : num 850 475 466 353 325 154 222 536 736 104 ...
$ total_weights : num 120 46 128 22 24 16 46 51 122 12 ...
$ average_weight : num 2.52 2.46 1.9 2.68 1.96 ...
$ types : chr "boardgame,abstracts" "boardgame,abstracts" "boardgame,abstracts" "boardgame,abstracts" ...
$ categories : chr "Abstract Strategy,Medieval" "Abstract Strategy" "Abstract Strategy" "Abstract Strategy" ...
$ mechanics : chr "Grid Movement,Tile Placement" "Grid Movement,Network and Route Building" "" "Grid Movement,Modular Board,Point to Point Movement" ...
$ family : chr "Crowdfunding: Kickstarter,The Duke,Mensa Select" "5x5 grid,Combinatorial,Connection Games,Crowdfunding: Kickstarter,Fictional Games" "Combinatorial,Crowdfunding: Kickstarter,Gemblo" "Crowdfunding: Kickstarter" ...
$ designers : chr "Jeremy Holcomb,Stephen McLaughlin" "James Ernest,Patrick Rothfuss" "Justin Oh" "Dox Lucchin,Pedro Latro" ...
$ description : chr "Levy. Maneuver. Conquer. The Duke is a dynamic, tile-based strategy game with an old-world, feudal the"| __truncated__ ""My next several hours were spent learning how to play tak. Even if I had not been nearly mad with idlenes"| __truncated__ "Gemblo is an abstract strategy game with translucent, colored pieces, each of which is made up of one to five h"| __truncated__ ""There was something formless and perfect before the Universe was born. For lack of a better name, I call "| __truncated__ ...
#makes options
options(scipen = 999)
mytheme <-
theme(
axis.text.x = element_text(
angle = 90,
size = 8,
vjust = 0.4
),
plot.title = element_text(
size = 12,
vjust = 2,
family = "Georgia",
face = "bold",
margin = margin(b = 20)
),
axis.title.y = element_text(margin = margin(r = 20)),
axis.title.x = element_text(
size = 12,
vjust = -0.35,
margin = margin(t = 20)
),
plot.background = element_rect(fill = "#EDEFF7"),
panel.background = element_rect(fill = "#EDEFF7"),
legend.background = element_rect(fill = "#EDEFF7"),
legend.title = element_text(
size = 10,
family = "Arial",
face = "bold"
),
legend.text = element_text(size = 8, family = "Arial"),
panel.grid.major = element_line(
size = 0.4,
linetype = "solid",
color = "#cccccc"
),
panel.grid.minor = element_line(size = 0),
axis.ticks = element_blank(),
plot.margin = unit(c(0.5, 1, 1, 1), "cm")
)
colors = c(
"#9E0142",
"#D53E4F",
"#F46D43" ,
"#FDAE61",
"#FEE08B",
"#e7fe8b",
"#bcfe8b",
"#8bfeb1",
"#8bc4fe",
"#8b96fe",
"#ad8bfe",
"#d98bfe",
"#fe8bd1",
"#fe8b96",
"#fb6761"
)
treemap of mechanics
library(treemap)
library(viridis)
library(splitstackshape)
ndata$mechanics <- as.character(ndata$mechanics)
tags <-
cSplit(ndata, splitCols = "mechanics", direction = "long")
tags.table <- as.data.frame(table(tags$mechanics))
treemap(
tags.table,
index = "Var1",
vSize = "Freq",
vColor = "#58ACFA",
palette = viridis(6)
)

distribution of weight
options(repr.plot.width = 3, repr.plot.height = 3)
ndata %>% ggplot(aes(x = average_weight)) + geom_density(fill = "#FDAE61", alpha =
0.4) + mytheme + labs(title = "Distribution of weight")

distribution of ages
options(repr.plot.width = 4, repr.plot.height = 4)
ndata %>% ggplot(aes(x = minage)) + geom_histogram(aes(y = ..density..), fill =
"#9E0142", alpha = 0.4) + mytheme + geom_density(col = "#F46D43") + labs(title =
"Distribution of Age")

category
options(repr.plot.width = 12, repr.plot.height = 20)
category <-
as.data.frame(table(str_trim(unlist(
strsplit(str_trim(as.character(ndata$types)), ", ")
))))
category %>% arrange(desc(Freq)) %>% ggplot(aes(x = Var1, y = Freq, fill =
Var1)) + geom_bar(stat = "identity") + coord_flip() + theme(legend.position = "") +
labs(x = "categories", y = "count", title = "Games Per Category")

distribution of different rating
options(repr.plot.width = 10, repr.plot.height = 4)
g3 <-
ndata %>% ggplot(aes(x = bayes_average_rating)) + geom_density(fill = "#FEE08B", alpha =
0.4) + mytheme + labs(title = "Distribution of Bayes Average Rating")
grid.arrange(g3, nrow = 1, ncol = 1)

options(repr.plot.width = 10, repr.plot.height = 4)
g2 <-
ndata %>% ggplot(aes(x = average_rating)) + geom_density(fill = "#FDAE61", alpha =
0.4) + mytheme + labs(title = "Distribution of Average Rating")
grid.arrange(g2, nrow = 1, ncol = 1)

popular games per average rating
options(repr.plot.width = 10, repr.plot.height = 4)
ndata %>% select(name, average_rating) %>% arrange(desc(average_rating)) %>%
top_n(20) %>% ggplot(aes(x = reorder(name, average_rating), y = average_rating)) +
geom_point(col = "#7f34f3", size = 3) + coord_flip() + geom_segment(aes(
x = name,
xend =
name,
y =
0,
yend =
average_rating
)) + ylim(c(7, 10)) + mytheme + labs(x = "Games", title = "Popular Games per average Rating")
Selecting by average_rating

popular games per bayes average rating
options(repr.plot.width = 10, repr.plot.height = 4)
ndata %>% select(name, bayes_average_rating) %>% arrange(desc(bayes_average_rating)) %>%
top_n(20) %>% ggplot(aes(x = reorder(name, bayes_average_rating), y = bayes_average_rating)) +
geom_point(col = "#7f34f3", size = 3) + coord_flip() + geom_segment(aes(
x = name,
xend =
name,
y =
0,
yend =
bayes_average_rating
)) + ylim(c(7, 10)) + mytheme + labs(x = "Games", title = "Popular Games per bayes average Rating")
Selecting by bayes_average_rating

linear model:
m1. <-
lm(
average_rating ~ types + maxplayers + minage + total_wanters + yearpublished + playingtime + users_rated + average_weight + total_owners,
data = par_year1234
)
summary(m1.)
Call:
lm(formula = average_rating ~ types + maxplayers + minage + total_wanters +
yearpublished + playingtime + users_rated + average_weight +
total_owners, data = par_year1234)
Residuals:
Min 1Q Median 3Q Max
-2.35232 -0.24534 0.01443 0.25114 2.17439
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.87067363 0.11374194 51.614 < 0.0000000000000002 ***
typesboardgame,strategygames 0.03273491 0.05892177 0.556 0.578707
maxplayers 0.01001343 0.00483840 2.070 0.038903 *
minage -0.00623653 0.00839849 -0.743 0.458017
total_wanters 0.00111332 0.00014275 7.799 0.0000000000000263 ***
yearpublished2 0.16490067 0.04966829 3.320 0.000952 ***
yearpublished3 0.33804976 0.05739377 5.890 0.0000000063147203 ***
yearpublished4 0.67470240 0.06218698 10.850 < 0.0000000000000002 ***
playingtime 0.00048708 0.00041633 1.170 0.242478
users_rated -0.00006134 0.00002301 -2.666 0.007869 **
average_weight 0.29092989 0.04547164 6.398 0.0000000003090201 ***
total_owners 0.00003987 0.00001445 2.760 0.005952 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4908 on 624 degrees of freedom
Multiple R-squared: 0.5157, Adjusted R-squared: 0.5072
F-statistic: 60.42 on 11 and 624 DF, p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(m1.)

Polynomial regression
quadratic model:
m2s. <-
lm(
average_rating ~ types+ yearpublished+poly(maxplayers, 2) + poly(minage, 2) +
poly(total_wanters, 2)+ poly(playingtime, 2) + poly(users_rated, 2) + poly(average_weight, 2) + poly(total_owners, 2),
data = par_year1234
)
summary(m2s.)
Call:
lm(formula = average_rating ~ types + yearpublished + poly(maxplayers,
2) + poly(minage, 2) + poly(total_wanters, 2) + poly(playingtime,
2) + poly(users_rated, 2) + poly(average_weight, 2) + poly(total_owners,
2), data = par_year1234)
Residuals:
Min 1Q Median 3Q Max
-2.24115 -0.22880 -0.00857 0.23810 2.26954
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.78441 0.04968 136.560 < 0.0000000000000002 ***
typesboardgame,strategygames 0.03594 0.05835 0.616 0.538214
yearpublished2 0.17312 0.04772 3.628 0.000310 ***
yearpublished3 0.31212 0.05518 5.657 0.00000002362118310 ***
yearpublished4 0.60649 0.06055 10.016 < 0.0000000000000002 ***
poly(maxplayers, 2)1 1.34974 0.51326 2.630 0.008759 **
poly(maxplayers, 2)2 -0.23954 0.49702 -0.482 0.630013
poly(minage, 2)1 -0.41506 0.53601 -0.774 0.439017
poly(minage, 2)2 -0.54438 0.48860 -1.114 0.265643
poly(total_wanters, 2)1 7.43181 0.91520 8.120 0.00000000000000253 ***
poly(total_wanters, 2)2 -5.17844 0.63806 -8.116 0.00000000000000262 ***
poly(playingtime, 2)1 0.71776 0.72261 0.993 0.320963
poly(playingtime, 2)2 0.24389 0.60269 0.405 0.685867
poly(users_rated, 2)1 -6.36362 2.73241 -2.329 0.020184 *
poly(users_rated, 2)2 4.68346 1.38830 3.374 0.000789 ***
poly(average_weight, 2)1 5.05295 0.96378 5.243 0.00000021752684085 ***
poly(average_weight, 2)2 -0.31244 0.51904 -0.602 0.547426
poly(total_owners, 2)1 5.91401 2.43880 2.425 0.015596 *
poly(total_owners, 2)2 -1.76605 1.35128 -1.307 0.191717
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4671 on 617 degrees of freedom
Multiple R-squared: 0.5664, Adjusted R-squared: 0.5538
F-statistic: 44.78 on 18 and 617 DF, p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(m2s.)

three-dimensional models:
m3s. <-
lm(
average_rating ~ types + yearpublished +poly(maxplayers, 3) +
poly(minage, 3) + poly(total_wanters, 3) + poly(playingtime, 3) + poly(users_rated, 3) + poly(average_weight, 3) + poly(total_owners, 3),
data = par_year1234
)
summary(m3s.)
Call:
lm(formula = average_rating ~ types + yearpublished + poly(maxplayers,
3) + poly(minage, 3) + poly(total_wanters, 3) + poly(playingtime,
3) + poly(users_rated, 3) + poly(average_weight, 3) + poly(total_owners,
3), data = par_year1234)
Residuals:
Min 1Q Median 3Q Max
-2.13758 -0.21537 0.00214 0.23257 2.30184
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.78748 0.04916 138.063 < 0.0000000000000002 ***
typesboardgame,strategygames 0.03956 0.05934 0.667 0.505293
yearpublished2 0.17642 0.04682 3.768 0.000181 ***
yearpublished3 0.31182 0.05424 5.749 0.0000000142 ***
yearpublished4 0.56972 0.06006 9.486 < 0.0000000000000002 ***
poly(maxplayers, 3)1 1.43614 0.50578 2.839 0.004670 **
poly(maxplayers, 3)2 -0.25292 0.48733 -0.519 0.603950
poly(maxplayers, 3)3 -0.45076 0.47532 -0.948 0.343342
poly(minage, 3)1 -0.17062 0.53241 -0.320 0.748717
poly(minage, 3)2 -0.52461 0.49185 -1.067 0.286571
poly(minage, 3)3 -0.40435 0.50265 -0.804 0.421452
poly(total_wanters, 3)1 9.29756 1.01430 9.166 < 0.0000000000000002 ***
poly(total_wanters, 3)2 -3.60053 0.72168 -4.989 0.0000007921 ***
poly(total_wanters, 3)3 3.74396 0.74363 5.035 0.0000006310 ***
poly(playingtime, 3)1 0.57211 0.72082 0.794 0.427682
poly(playingtime, 3)2 0.45521 0.61624 0.739 0.460384
poly(playingtime, 3)3 -1.53273 0.52647 -2.911 0.003730 **
poly(users_rated, 3)1 -7.20371 2.78451 -2.587 0.009910 **
poly(users_rated, 3)2 2.11544 1.75839 1.203 0.229422
poly(users_rated, 3)3 -4.47925 1.16471 -3.846 0.000133 ***
poly(average_weight, 3)1 4.73463 0.97591 4.851 0.0000015578 ***
poly(average_weight, 3)2 -0.68095 0.54516 -1.249 0.212109
poly(average_weight, 3)3 0.19682 0.50824 0.387 0.698706
poly(total_owners, 3)1 4.74592 2.45059 1.937 0.053250 .
poly(total_owners, 3)2 -1.90774 1.49979 -1.272 0.203857
poly(total_owners, 3)3 2.00152 1.09931 1.821 0.069142 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4545 on 610 degrees of freedom
Multiple R-squared: 0.5942, Adjusted R-squared: 0.5775
F-statistic: 35.72 on 25 and 610 DF, p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(m3s.)

machine learning:
ndata$average_rating.high <- ndata$average_rating > 7
ndata$average_rating.high <- 1 * (ndata$average_rating.high)
dataset <- ndata
library(caret)
library(lattice)
library(ggplot2)
set.seed(509) # Set random seed
# Shuffle rows
rows <- sample(nrow(dataset))
dataset <- dataset[rows, ]
# create a list of 80% of the rows in the original dataset we can use for training
train_index <-
createDataPartition(dataset$average_rating.high, p = 0.80, list = FALSE)
# Randomly select 80% of data as the training data set
data_tr <- dataset[train_index, ]
# Assign the remaining 20% of as the validation data set
data_va <- dataset[-train_index, ]
head(data_tr, 3)
# scatterplot matrix
library(ellipse) # package ellipse is required
data_tr$types <- as.character(data_tr$types)
data_tr$average_rating.high <-
as.factor(data_tr$average_rating.high)
x <- c(4:7,10:20)
featurePlot(
x = data_tr[, x],
y = data_tr$average_rating.high,
plot = "ellipse",
## Add a key at the top
auto.key = list(columns = 2)
)

# summarize the class distribution
table.class <- table(data_tr$average_rating.high)
percentage <- prop.table(table.class) * 100
print(cbind(counts = table.class, percentage = percentage))
counts percentage
0 340 43.53393
1 441 56.46607
# Run algorithms using 10-fold cross validation
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"
dataset$average_rating.high <-
as.factor(dataset$average_rating.high)
# a) linear algorithms
set.seed(7)
fit.lda <-
train(
average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
data = dataset,
method = "lda",
metric = metric,
trControl = control
)
# b) nonlinear algorithms
# CART
set.seed(7)
fit.cart <-
train(
average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
data = dataset,
method = "rpart",
metric = metric,
trControl = control
)
# kNN
set.seed(7)
fit.knn <-
train(
average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
data = dataset,
method = "knn",
metric = metric,
trControl = control
)
# c) advanced algorithms
# SVM
set.seed(7)
fit.svm <-
train(
average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
data = dataset,
method = "svmRadial",
metric = metric,
trControl = control
)
# Random Forest
set.seed(7)
fit.rf <-
train(
average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
data = dataset,
method = "rf",
metric = metric,
trControl = control
)
# d) Special guests
# XGBoost: LightGBM, etc. Friend in high places :)
set.seed(7)
fit.xgb <-
train(
average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
data = dataset,
method = "xgbTree",
metric = metric,
trControl = control
)
# summarize accuracy of models
results <- resamples(list(lda=fit.lda, cart=fit.cart, knn=fit.knn, svm=fit.svm, rf=fit.rf, xgb=fit.xgb))
summary(results)
Call:
summary.resamples(object = results)
Models: lda, cart, knn, svm, rf, xgb
Number of resamples: 10
Accuracy
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
lda 0.6836735 0.7397959 0.7845571 0.7706186 0.8020724 0.8247423 0
cart 0.6428571 0.7500000 0.7704082 0.7582790 0.7815064 0.8041237 0
knn 0.7142857 0.7216495 0.7487376 0.7612666 0.7959184 0.8453608 0
svm 0.7755102 0.8067010 0.8265306 0.8310120 0.8509363 0.9175258 0
rf 0.8163265 0.8380234 0.8564065 0.8545550 0.8673469 0.8969072 0
xgb 0.8265306 0.8392857 0.8564065 0.8597307 0.8740532 0.9175258 0
Kappa
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
lda 0.3560831 0.4729190 0.5571063 0.5334978 0.5972336 0.6440751 0
cart 0.2910294 0.4866880 0.5210937 0.5015601 0.5593714 0.5907173 0
knn 0.4198732 0.4372251 0.4939052 0.5163414 0.5866723 0.6787370 0
svm 0.5418615 0.6060157 0.6495024 0.6563757 0.6970886 0.8320346 0
rf 0.6308079 0.6760997 0.7081475 0.7056573 0.7320100 0.7900433 0
xgb 0.6468843 0.6718148 0.7078990 0.7149074 0.7442114 0.8310840 0
Conclusion and Discussion
- users_rated, average_weight, total_owners have significant effects.
- SVM, Random forest, XG bloost are good for predicting the rating score.
---
title: "BroadGames Analysis"
output: html_notebook
---

```{r}
ndata <- readRDS("C:/Users/df568923/Desktop/stat/data/ndata.rds")
par_year1234 <- readRDS("C:/Users/df568923/Desktop/stat/data/par_year1234.rds")
```
## Outline
* Questions
* Data
* Result: Analysis and Findings
* Conclusion and Discussion


## Questions
- What kind of Broadgames tends to be popular?
- Relation between parameters of broadgame and the rating it gets.
- Using Machine Learning to predict the rating score. 

## Data: Source
https://boardgamegeek.com/boardgame/174430/gloomhaven
![bgg](C:/Users/df568923/Desktop/stat/R notebook/bgg.png){width=90%}

#Data
```{r}
head(ndata,1)
```


#load library
```{r}
library(ggplot2)
library(readr)
library(dplyr)
library(stringr)
library(gridExtra)
library(ggrepel)

str(ndata)

#makes options
options(scipen = 999)
mytheme <-
  theme(
    axis.text.x = element_text(
      angle = 90,
      size = 8,
      vjust = 0.4
    ),
    plot.title = element_text(
      size = 12,
      vjust = 2,
      family = "Georgia",
      face = "bold",
      margin = margin(b = 20)
    ),
    axis.title.y = element_text(margin = margin(r = 20)),
    axis.title.x = element_text(
      size = 12,
      vjust = -0.35,
      margin = margin(t = 20)
    ),
    plot.background = element_rect(fill = "#EDEFF7"),
    panel.background = element_rect(fill = "#EDEFF7"),
    legend.background = element_rect(fill = "#EDEFF7"),
    legend.title = element_text(
      size = 10,
      family = "Arial",
      face = "bold"
    ),
    legend.text = element_text(size = 8, family = "Arial"),
    panel.grid.major = element_line(
      size = 0.4,
      linetype = "solid",
      color = "#cccccc"
    ),
    panel.grid.minor = element_line(size = 0),
    axis.ticks = element_blank(),
    plot.margin = unit(c(0.5, 1, 1, 1), "cm")
  )
colors = c(
  "#9E0142",
  "#D53E4F",
  "#F46D43" ,
  "#FDAE61",
  "#FEE08B",
  "#e7fe8b",
  "#bcfe8b",
  "#8bfeb1",
  "#8bc4fe",
  "#8b96fe",
  "#ad8bfe",
  "#d98bfe",
  "#fe8bd1",
  "#fe8b96",
  "#fb6761"
)
```

#treemap of mechanics
```{r}
library(treemap)
library(viridis)
library(splitstackshape)
ndata$mechanics <- as.character(ndata$mechanics)
tags <-
  cSplit(ndata, splitCols = "mechanics", direction = "long")
tags.table <- as.data.frame(table(tags$mechanics))

treemap(
  tags.table,
  index = "Var1",
  vSize = "Freq",
  vColor = "#58ACFA",
  palette = viridis(6)
)
```



#distribution of weight
```{r}
options(repr.plot.width = 3, repr.plot.height = 3)
ndata %>% ggplot(aes(x = average_weight)) + geom_density(fill = "#FDAE61", alpha =
                                                                0.4) + mytheme + labs(title = "Distribution of weight")
```

#distribution of ages
```{r}
options(repr.plot.width = 4, repr.plot.height = 4)
ndata %>% ggplot(aes(x = minage)) + geom_histogram(aes(y = ..density..), fill =
                                                          "#9E0142", alpha = 0.4) + mytheme + geom_density(col = "#F46D43") + labs(title =
                                                                                                                                     "Distribution of Age")
```

#category
```{r}
options(repr.plot.width = 12, repr.plot.height = 20)
category <-
  as.data.frame(table(str_trim(unlist(
    strsplit(str_trim(as.character(ndata$types)), ", ")
  ))))
category %>% arrange(desc(Freq)) %>% ggplot(aes(x = Var1, y = Freq, fill =
                                                  Var1)) + geom_bar(stat = "identity") + coord_flip() + theme(legend.position = "") +
  labs(x = "categories", y = "count", title = "Games Per Category")
```


##distribution of different rating
```{r}
options(repr.plot.width = 10, repr.plot.height = 4)

g3 <-
  ndata %>% ggplot(aes(x = bayes_average_rating)) + geom_density(fill = "#FEE08B", alpha =
                                                                        0.4) + mytheme + labs(title = "Distribution of Bayes Average Rating")

grid.arrange(g3, nrow = 1, ncol = 1)


options(repr.plot.width = 10, repr.plot.height = 4)
g2 <-
  ndata %>% ggplot(aes(x = average_rating)) + geom_density(fill = "#FDAE61", alpha =
                                                                  0.4) + mytheme + labs(title = "Distribution of Average Rating")
grid.arrange(g2, nrow = 1, ncol = 1)
```


#popular games per average rating
```{r}
options(repr.plot.width = 10, repr.plot.height = 4)
ndata %>% select(name, average_rating) %>% arrange(desc(average_rating)) %>%
  top_n(20) %>% ggplot(aes(x = reorder(name, average_rating), y = average_rating)) +
  geom_point(col = "#7f34f3", size = 3) + coord_flip() + geom_segment(aes(
    x = name,
    xend =
      name,
    y =
      0,
    yend =
      average_rating
  )) + ylim(c(7, 10)) + mytheme + labs(x = "Games", title = "Popular Games per average Rating")
```

#popular games per bayes average rating
```{r}
options(repr.plot.width = 10, repr.plot.height = 4)
ndata %>% select(name, bayes_average_rating) %>% arrange(desc(bayes_average_rating)) %>%
  top_n(20) %>% ggplot(aes(x = reorder(name, bayes_average_rating), y = bayes_average_rating)) +
  geom_point(col = "#7f34f3", size = 3) + coord_flip() + geom_segment(aes(
    x = name,
    xend =
      name,
    y =
      0,
    yend =
      bayes_average_rating
  )) + ylim(c(7, 10)) + mytheme + labs(x = "Games", title = "Popular Games per bayes average Rating")
```



#linear model:
```{r}
m1. <-
  lm(
    average_rating ~ types + maxplayers + minage + total_wanters + yearpublished + playingtime + users_rated + average_weight + total_owners,
    data = par_year1234
  )
summary(m1.)
par(mfrow = c(2,2))
plot(m1.)
```

#Polynomial regression
#quadratic model:
```{r}
m2s. <-
  lm(
    average_rating ~ types+ yearpublished+poly(maxplayers, 2) + poly(minage, 2) +
      poly(total_wanters, 2)+ poly(playingtime, 2) + poly(users_rated, 2) + poly(average_weight, 2) + poly(total_owners, 2),
    data = par_year1234
  )
summary(m2s.)
par(mfrow = c(2,2))
plot(m2s.)
```


#three-dimensional models:
```{r}
m3s. <-
  lm(
    average_rating ~ types  + yearpublished +poly(maxplayers, 3)  +
      poly(minage, 3) + poly(total_wanters, 3) + poly(playingtime, 3) + poly(users_rated, 3) + poly(average_weight, 3) + poly(total_owners, 3),
    data = par_year1234
  )
summary(m3s.)
par(mfrow = c(2,2))
plot(m3s.)
```


#machine learning:
```{r}
ndata$average_rating.high <- ndata$average_rating > 7
ndata$average_rating.high <-  1 * (ndata$average_rating.high)



dataset <- ndata
library(caret)
library(lattice)
library(ggplot2)

set.seed(509)   # Set random seed
# Shuffle rows
rows <- sample(nrow(dataset))
dataset <- dataset[rows, ]

# create a list of 80% of the rows in the original dataset we can use for training
train_index <-
  createDataPartition(dataset$average_rating.high, p = 0.80, list = FALSE)
# Randomly select 80% of data as the training data set
data_tr <- dataset[train_index, ]
# Assign the remaining 20% of as the validation data set
data_va <- dataset[-train_index, ]

head(data_tr, 3)
```

```{r}
# scatterplot matrix
library(ellipse)    # package ellipse is required
data_tr$types <- as.character(data_tr$types)
data_tr$average_rating.high <-
  as.factor(data_tr$average_rating.high)
x <- c(4:7,10:20)
featurePlot(
  x = data_tr[, x],
  y = data_tr$average_rating.high,
  plot = "ellipse",
  ## Add a key at the top
  auto.key = list(columns = 2)
)

# summarize the class distribution
table.class <- table(data_tr$average_rating.high)
percentage <- prop.table(table.class) * 100
print(cbind(counts = table.class, percentage = percentage))
```




```{r}
# Run algorithms using 10-fold cross validation
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"


dataset$average_rating.high <-
  as.factor(dataset$average_rating.high)
# a) linear algorithms
set.seed(7)
fit.lda <-
  train(
    average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
    data = dataset,
    method = "lda",
    metric = metric,
    trControl = control
  )

# b) nonlinear algorithms
# CART
set.seed(7)
fit.cart <-
  train(
    average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
    data = dataset,
    method = "rpart",
    metric = metric,
    trControl = control
  )

# kNN
set.seed(7)
fit.knn <-
  train(
    average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
    data = dataset,
    method = "knn",
    metric = metric,
    trControl = control
  )

# c) advanced algorithms
# SVM
set.seed(7)
fit.svm <-
  train(
    average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
    data = dataset,
    method = "svmRadial",
    metric = metric,
    trControl = control
  )

# Random Forest
set.seed(7)
fit.rf <-
  train(
    average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
    data = dataset,
    method = "rf",
    metric = metric,
    trControl = control
  )

# d) Special guests
# XGBoost: LightGBM, etc. Friend in high places :)
set.seed(7)
fit.xgb <-
  train(
    average_rating.high ~ yearpublished + minplayers + maxplayers + playingtime + minage + users_rated + total_owners + total_traders + total_wanters + total_wishers + total_comments + total_weights + average_weight,
    data = dataset,
    method = "xgbTree",
    metric = metric,
    trControl = control
  )

# summarize accuracy of models
results <- resamples(list(lda=fit.lda, cart=fit.cart, knn=fit.knn, svm=fit.svm, rf=fit.rf, xgb=fit.xgb))
summary(results)
```


## Conclusion and Discussion

-  users_rated, average_weight, total_owners have significant effects.  
-  SVM, Random forest, XG bloost are good for predicting the rating score.

### References
-https://www.kaggle.com/devisangeetha/insights-geek-board-game  
-https://www.kaggle.com/maxphilipp/board-games-mechanics






