Data: Video Games Rating by ‘ESRB’
Task: Predict the rating for each game in test data
Research Question: What are the the key features that can predict the [ESRB rating] (https://www.esrb.org/ratings-guide/) of a game?
library(tidyverse)
library(skimr)
library(ggsci)
library(Hmisc)
library(corrplot)
library(FactoMineR)
library(factoextra)
library(rpart)
library(rpart.plot)
library(rattle)
library(caret)
library(randomForest)
library(xgboost)
library(pROC)
train_data= readr::read_csv("Video_games_esrb_rating.csv")
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
title = col_character(),
esrb_rating = col_character()
)
ℹ Use `spec()` for the full column specifications.
test_data= readr::read_csv("test_esrb.csv")
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
title = col_character(),
esrb_rating = col_character()
)
ℹ Use `spec()` for the full column specifications.
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 1895
Number of columns 34
_______________________
Column type frequency:
factor 34
________________________
Group variables None
── Variable type: factor ─────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate ordered n_unique top_counts
1 title 0 1 FALSE 1890 Bri: 2, Kil: 2, Mut: 2, Mys: 2
2 console 0 1 FALSE 2 0: 994, 1: 901
3 alcohol_reference 0 1 FALSE 2 0: 1800, 1: 95
4 animated_blood 0 1 FALSE 2 0: 1876, 1: 19
5 blood 0 1 FALSE 2 0: 1463, 1: 432
6 blood_and_gore 0 1 FALSE 2 0: 1656, 1: 239
7 cartoon_violence 0 1 FALSE 2 0: 1858, 1: 37
8 crude_humor 0 1 FALSE 2 0: 1792, 1: 103
9 drug_reference 0 1 FALSE 2 0: 1829, 1: 66
10 fantasy_violence 0 1 FALSE 2 0: 1477, 1: 418
11 intense_violence 0 1 FALSE 2 0: 1671, 1: 224
12 language 0 1 FALSE 2 0: 1690, 1: 205
13 lyrics 0 1 FALSE 2 0: 1832, 1: 63
14 mature_humor 0 1 FALSE 2 0: 1873, 1: 22
15 mild_blood 0 1 FALSE 2 0: 1763, 1: 132
16 mild_cartoon_violence 0 1 FALSE 2 0: 1850, 1: 45
17 mild_fantasy_violence 0 1 FALSE 2 0: 1804, 1: 91
18 mild_language 0 1 FALSE 2 0: 1854, 1: 41
19 mild_lyrics 0 1 FALSE 2 0: 1749, 1: 146
20 mild_suggestive_themes 0 1 FALSE 2 0: 1811, 1: 84
21 mild_violence 0 1 FALSE 2 0: 1805, 1: 90
22 no_descriptors 0 1 FALSE 2 0: 1573, 1: 322
23 nudity 0 1 FALSE 2 0: 1867, 1: 28
24 partial_nudity 0 1 FALSE 2 0: 1870, 1: 25
25 sexual_content 0 1 FALSE 2 0: 1830, 1: 65
26 sexual_themes 0 1 FALSE 2 0: 1786, 1: 109
27 simulated_gambling 0 1 FALSE 2 0: 1768, 1: 127
28 strong_janguage 0 1 FALSE 2 0: 1671, 1: 224
29 strong_sexual_content 0 1 FALSE 2 0: 1827, 1: 68
30 suggestive_themes 0 1 FALSE 2 0: 1672, 1: 223
31 use_of_alcohol 0 1 FALSE 2 0: 1865, 1: 30
32 use_of_drugs_and_alcohol 0 1 FALSE 2 0: 1865, 1: 30
33 violence 0 1 FALSE 2 0: 1774, 1: 121
34 esrb_rating 0 1 FALSE 4 T: 689, E: 416, ET: 403, M: 387
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 500
Number of columns 34
_______________________
Column type frequency:
factor 34
________________________
Group variables None
── Variable type: factor ─────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate ordered n_unique top_counts
1 title 0 1 FALSE 494 88 : 2, Cit: 2, Eli: 2, The: 2
2 console 0 1 FALSE 2 0: 253, 1: 247
3 alcohol_reference 0 1 FALSE 2 0: 474, 1: 26
4 animated_blood 0 1 FALSE 2 0: 499, 1: 1
5 blood 0 1 FALSE 2 0: 378, 1: 122
6 blood_and_gore 0 1 FALSE 2 0: 434, 1: 66
7 cartoon_violence 0 1 FALSE 2 0: 489, 1: 11
8 crude_humor 0 1 FALSE 2 0: 480, 1: 20
9 drug_reference 0 1 FALSE 2 0: 484, 1: 16
10 fantasy_violence 0 1 FALSE 2 0: 367, 1: 133
11 intense_violence 0 1 FALSE 2 0: 462, 1: 38
12 language 0 1 FALSE 2 0: 435, 1: 65
13 lyrics 0 1 FALSE 2 0: 480, 1: 20
14 mature_humor 0 1 FALSE 1 0: 500
15 mild_blood 0 1 FALSE 2 0: 460, 1: 40
16 mild_cartoon_violence 0 1 FALSE 2 0: 494, 1: 6
17 mild_fantasy_violence 0 1 FALSE 2 0: 471, 1: 29
18 mild_language 0 1 FALSE 2 0: 473, 1: 27
19 mild_lyrics 0 1 FALSE 2 0: 468, 1: 32
20 mild_suggestive_themes 0 1 FALSE 2 0: 480, 1: 20
21 mild_violence 0 1 FALSE 2 0: 486, 1: 14
22 no_descriptors 0 1 FALSE 2 0: 443, 1: 57
23 nudity 0 1 FALSE 2 0: 489, 1: 11
24 partial_nudity 0 1 FALSE 2 0: 471, 1: 29
25 sexual_content 0 1 FALSE 2 0: 490, 1: 10
26 sexual_themes 0 1 FALSE 2 0: 495, 1: 5
27 simulated_gambling 0 1 FALSE 2 0: 489, 1: 11
28 strong_janguage 0 1 FALSE 2 0: 474, 1: 26
29 strong_sexual_content 0 1 FALSE 2 0: 475, 1: 25
30 suggestive_themes 0 1 FALSE 2 0: 462, 1: 38
31 use_of_alcohol 0 1 FALSE 2 0: 457, 1: 43
32 use_of_drugs_and_alcohol 0 1 FALSE 2 0: 475, 1: 25
33 violence 0 1 FALSE 2 0: 346, 1: 154
34 esrb_rating 0 1 FALSE 4 T: 184, ET: 126, E: 100, M: 90
Hmisc::describe(as.factor(train_data$esrb_rating))
as.factor(train_data$esrb_rating)
n missing distinct
1895 0 4
Value E ET M T
Frequency 416 403 387 689
Proportion 0.220 0.213 0.204 0.364
Hmisc::describe(as.factor(test_data$esrb_rating))
as.factor(test_data$esrb_rating)
n missing distinct
500 0 4
Value E ET M T
Frequency 100 126 90 184
Proportion 0.200 0.252 0.180 0.368
prop_train = train_data %>%
group_by(esrb_rating) %>%
tally() %>%
mutate(train=round(n/sum(n),3))
prop_test = test_data %>%
group_by(esrb_rating) %>%
tally() %>%
mutate(test=round(n/sum(n),3))
rat_p = merge(prop_train, prop_test, by="esrb_rating") %>% #merge
mutate(esrb_rating_long = recode(esrb_rating, #rating name
"T"="Teen",
"E" ="Everyone",
"ET" ="Everyone 10+",
"M" ="Mature")) %>%
select(esrb_rating_long, train, test) %>%
pivot_longer(!esrb_rating_long, names_to="data", values_to="pct") %>% #wide to long
# plot
ggplot(aes(x=esrb_rating_long, y=pct, fill= data)) +
geom_col(position="dodge") +
geom_text(aes(label=paste0(pct*100,"%")), position=position_dodge(width=1), vjust=1.5, color="white", size=3.5) +
scale_x_discrete(limits=c("Everyone","Everyone 10+","Teen","Mature")) +
scale_fill_manual(values=c("#30638e","#003d5b")) +
scale_y_continuous(limits=c(0,0.37), labels=scales::percent) +
theme_bw() +
theme(plot.title=element_text(size=11)) +
labs(fill="Data",
y="Percentage",
x="ESRB Rating",
title="Percent distribution of ESRB rating in test and train data")
rat_p
# recode esrb rating to numeric for classification
train_data %>%
mutate(esrb = recode(esrb_rating,
"E" = 0,
"ET"= 1,
"T" =2,
"M" =3)) -> train_data
test_data %>%
mutate(esrb = recode(esrb_rating,
"E" = 0,
"ET"= 1,
"T" =2,
"M" =3)) -> test_data
# function to format matrix
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
# train data
train_features = train_data %>% select(where(is.numeric))
res_train = rcorr(as.matrix(train_features), type=c("spearman"))
cor_train = flattenCorrMatrix(res_train$r, res_train$P)
# sig. corr (highest 10)
cor_train %>%
mutate(across(3:4, round,4)) %>%
filter(column=="esrb") %>%
filter(p<0.01) %>%
arrange(desc(cor)) %>%
dplyr::slice(1:10)
# test data
test_features = test_data %>% select(where(is.numeric))
res_test = rcorr(as.matrix(test_features), type=c("spearman"))
cor_test = flattenCorrMatrix(res_test$r, res_test$P)
# sig. corr (highest 10)
cor_test %>%
mutate(across(3:4, round,4)) %>%
filter(column=="esrb") %>%
filter(p<0.05) %>%
arrange(desc(cor)) %>%
dplyr::slice(1:10)
# correlation heatmap
# reference: # http://www.sthda.com/english/wiki/visualize-correlation-matrix-using-correlogram
# function for pvalue
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
# plot train data
c_train = cor(train_features, method=c("spearman"))
p.mat <- cor.mtest(train_features) # pvalue
corrplot(c_train, method="color",order="hclust",
p.mat = p.mat, sig.level = 0.01, insig = "blank",
tl.col="black", tl.cex=0.7,
diag=FALSE
)
train_label = train_data %>% mutate(label= paste(esrb,":",esrb_rating))
pca_train <- PCA(train_features[,-33], graph = FALSE)
fviz_pca_ind(pca_train,
geom.ind = "point", # show points only
col.ind = as.factor(train_label$label), # color by rating
alpha.ind=0.8,
palette = "jco",
#addEllipses = TRUE, # Concentration ellipses
legend.title = "ESRB Rating",
title= "Train data"
)
test_label = test_data %>% mutate(label= paste(esrb,":",esrb_rating))
pca_test <- PCA(test_features[,-33], graph = FALSE)
fviz_pca_ind(pca_test,
geom.ind = "point", # show points only
col.ind = as.factor(test_label$label), # color by rating
alpha.ind=0.8,
palette = "jco",
#addEllipses = TRUE, # Concentration ellipses
legend.title = "ESRB Rating",
title= "Test data"
)
dt = rpart(esrb~., data= train_features, method="class")
fancyRpartPlot(dt)
printcp(dt)
Classification tree:
rpart(formula = esrb ~ ., data = train_features, method = "class")
Variables actually used in tree construction:
[1] blood blood_and_gore fantasy_violence mild_fantasy_violence no_descriptors
[6] strong_janguage suggestive_themes
Root node error: 1206/1895 = 0.63641
n= 1895
CP nsplit rel error xerror xstd
1 0.183250 0 1.00000 1.00000 0.017363
2 0.111940 2 0.63350 0.63350 0.017706
3 0.063018 3 0.52156 0.52156 0.016998
4 0.048922 4 0.45854 0.45854 0.016409
5 0.017413 5 0.40962 0.40962 0.015846
6 0.012438 6 0.39221 0.39221 0.015622
7 0.010000 7 0.37977 0.38640 0.015544
plotcp(dt)
dt$variable.importance
strong_janguage no_descriptors fantasy_violence mild_fantasy_violence blood_and_gore
203.0573083 188.2246279 101.7050377 79.2844795 70.7085658
blood suggestive_themes strong_sexual_content mild_blood lyrics
23.3380036 18.1659637 13.9615640 9.0522808 1.0845351
sexual_content alcohol_reference
0.5422676 0.2711338
# confusion matrix
dt.p = predict(dt, test_features, type = "class")
confusionMatrix(dt.p, as.factor(test_features$esrb))
Confusion Matrix and Statistics
Reference
Prediction 0 1 2 3
0 74 4 3 0
1 0 93 27 1
2 26 29 142 25
3 0 0 12 64
Overall Statistics
Accuracy : 0.746
95% CI : (0.7055, 0.7836)
No Information Rate : 0.368
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6452
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 0 Class: 1 Class: 2 Class: 3
Sensitivity 0.7400 0.7381 0.7717 0.7111
Specificity 0.9825 0.9251 0.7468 0.9707
Pos Pred Value 0.9136 0.7686 0.6396 0.8421
Neg Pred Value 0.9379 0.9129 0.8489 0.9387
Prevalence 0.2000 0.2520 0.3680 0.1800
Detection Rate 0.1480 0.1860 0.2840 0.1280
Detection Prevalence 0.1620 0.2420 0.4440 0.1520
Balanced Accuracy 0.8613 0.8316 0.7593 0.8409
# AUC
multiclass.roc(response= test_features$esrb, predictor = factor(dt.p, ordered=TRUE), plot=FALSE, print.auc=TRUE)
Call:
multiclass.roc.default(response = test_features$esrb, predictor = factor(dt.p, ordered = TRUE), plot = FALSE, print.auc = TRUE)
Data: factor(dt.p, ordered = TRUE) with 4 levels of test_features$esrb: 0, 1, 2, 3.
Multi-class area under the curve: 0.8634
# pruning
dt_prune=prune(dt, cp=0.029)
fancyRpartPlot(dt_prune)
printcp(dt_prune)
Classification tree:
rpart(formula = esrb ~ ., data = train_features, method = "class")
Variables actually used in tree construction:
[1] blood_and_gore fantasy_violence mild_fantasy_violence no_descriptors strong_janguage
Root node error: 1206/1895 = 0.63641
n= 1895
CP nsplit rel error xerror xstd
1 0.183250 0 1.00000 1.00000 0.017363
2 0.111940 2 0.63350 0.63350 0.017706
3 0.063018 3 0.52156 0.52156 0.016998
4 0.048922 4 0.45854 0.45854 0.016409
5 0.029000 5 0.40962 0.40962 0.015846
dt_prune$variable.importance
strong_janguage no_descriptors fantasy_violence mild_fantasy_violence blood_and_gore
203.057308 188.224628 101.705038 79.284479 70.708566
strong_sexual_content mild_blood
13.961564 9.052281
# confusion matrix
dt_prune.p = predict(dt_prune, test_features, type = "class")
confusionMatrix(dt_prune.p, as.factor(test_features$esrb))
Confusion Matrix and Statistics
Reference
Prediction 0 1 2 3
0 74 4 6 0
1 0 94 27 2
2 26 28 139 24
3 0 0 12 64
Overall Statistics
Accuracy : 0.742
95% CI : (0.7013, 0.7798)
No Information Rate : 0.368
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6403
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 0 Class: 1 Class: 2 Class: 3
Sensitivity 0.7400 0.7460 0.7554 0.7111
Specificity 0.9750 0.9225 0.7532 0.9707
Pos Pred Value 0.8810 0.7642 0.6406 0.8421
Neg Pred Value 0.9375 0.9151 0.8410 0.9387
Prevalence 0.2000 0.2520 0.3680 0.1800
Detection Rate 0.1480 0.1880 0.2780 0.1280
Detection Prevalence 0.1680 0.2460 0.4340 0.1520
Balanced Accuracy 0.8575 0.8342 0.7543 0.8409
# AUC
multiclass.roc(response= test_features$esrb, predictor = factor(dt_prune.p, ordered=TRUE), plot=FALSE, print.auc=TRUE)
Call:
multiclass.roc.default(response = test_features$esrb, predictor = factor(dt_prune.p, ordered = TRUE), plot = FALSE, print.auc = TRUE)
Data: factor(dt_prune.p, ordered = TRUE) with 4 levels of test_features$esrb: 0, 1, 2, 3.
Multi-class area under the curve: 0.8587
# plot dt variable importance
dtdf1 <- data.frame(imp = dt$variable.importance)
dtdf2 <- dtdf1 %>%
tibble::rownames_to_column() %>%
dplyr::rename("variable" = rowname) %>%
dplyr::arrange(imp) %>%
dplyr::mutate(variable = forcats::fct_inorder(variable))
ggplot2::ggplot(dtdf2) +
geom_point(aes(x = variable, y = imp), show.legend = F, shape=1, size=3) +
coord_flip() +
theme_bw() +
theme(axis.ticks.y = element_blank(),
panel.grid.major.x=element_blank(),
panel.grid.minor.x=element_blank(),
panel.grid=element_line(linetype = "dotted",size=0.7, color="grey"),
plot.title=element_text(hjust=0.5, face="bold"),
plot.margin=ggplot2::margin(10,40,10,30)) +
labs(title="dt")
set.seed(123)
rf2 <- train(factor(esrb) ~., data = train_features, method = "rf",
trControl = trainControl("cv", number = 10),
importance = TRUE)
rf2$bestTune
rf2$finalModel
Call:
randomForest(x = x, y = y, mtry = param$mtry, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 32
OOB estimate of error rate: 14.67%
Confusion matrix:
0 1 2 3 class.error
0 404 10 2 0 0.02884615
1 19 328 55 1 0.18610422
2 7 91 550 41 0.20174165
3 1 0 51 335 0.13436693
# predict
rf2.p <- rf2 %>% predict(test_features)
# accuracy
mean(rf2.p == test_features$esrb)
[1] 0.846
# confusion matrix
confusionMatrix(rf2.p, as.factor(test_features$esrb))
Confusion Matrix and Statistics
Reference
Prediction 0 1 2 3
0 95 4 2 0
1 4 112 19 2
2 1 10 155 27
3 0 0 8 61
Overall Statistics
Accuracy : 0.846
95% CI : (0.8113, 0.8765)
No Information Rate : 0.368
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7872
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 0 Class: 1 Class: 2 Class: 3
Sensitivity 0.9500 0.8889 0.8424 0.6778
Specificity 0.9850 0.9332 0.8797 0.9805
Pos Pred Value 0.9406 0.8175 0.8031 0.8841
Neg Pred Value 0.9875 0.9614 0.9055 0.9327
Prevalence 0.2000 0.2520 0.3680 0.1800
Detection Rate 0.1900 0.2240 0.3100 0.1220
Detection Prevalence 0.2020 0.2740 0.3860 0.1380
Balanced Accuracy 0.9675 0.9110 0.8611 0.8291
# AUC
multiclass.roc(response= test_features$esrb, predictor = factor(rf2.p, ordered=TRUE), plot=FALSE, print.auc=TRUE)
Call:
multiclass.roc.default(response = test_features$esrb, predictor = factor(rf2.p, ordered = TRUE), plot = FALSE, print.auc = TRUE)
Data: factor(rf2.p, ordered = TRUE) with 4 levels of test_features$esrb: 0, 1, 2, 3.
Multi-class area under the curve: 0.9407
# variable importance
varImpPlot(rf2$finalModel, type=2)
#varImp(rf2)
#importance(rf2$finalModel)
set.seed(123)
xgb <- train(factor(esrb) ~., data = train_features, method = "xgbTree",trControl = trainControl("cv", number = 10))
xgb$bestTune
xgb.p = xgb %>% predict(test_features)
mean(xgb.p == test_features$esrb)
[1] 0.828
# confusion matrix
confusionMatrix(xgb.p, as.factor(test_features$esrb))
Confusion Matrix and Statistics
Reference
Prediction 0 1 2 3
0 95 4 1 0
1 4 103 18 0
2 1 19 159 33
3 0 0 6 57
Overall Statistics
Accuracy : 0.828
95% CI : (0.792, 0.8601)
No Information Rate : 0.368
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7605
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 0 Class: 1 Class: 2 Class: 3
Sensitivity 0.9500 0.8175 0.8641 0.6333
Specificity 0.9875 0.9412 0.8323 0.9854
Pos Pred Value 0.9500 0.8240 0.7500 0.9048
Neg Pred Value 0.9875 0.9387 0.9132 0.9245
Prevalence 0.2000 0.2520 0.3680 0.1800
Detection Rate 0.1900 0.2060 0.3180 0.1140
Detection Prevalence 0.2000 0.2500 0.4240 0.1260
Balanced Accuracy 0.9688 0.8793 0.8482 0.8093
# variable importance (pct)
varImp(xgb)
xgbTree variable importance
only 20 most important variables shown (out of 32)