Start Date: 2 Sept 2025
Report Date: 30 September 2025
To acknowledge National Video Game Day which is September 12th, this month’s dataset is about video game sales.
The contest will run until September 30th, latest time zone possible. The entry form is available here and the prompts are below.
knitr::opts_chunk$set(echo = T,message = F,warning = F)
#setwd("C:/Users/alanh/Documents/R/Psi_Chi_R")
rm(list=ls())
setwd("~/R/Psi_Chi_R")
library(tidyverse)
#commenting out code but not for text
if (FALSE) {
}
#total for bottom row
sum_rows = function(x) {
x = as.data.frame(x) # Ensure x is a data frame
sums = sapply(x,function(col) if (is.numeric(col)) sum(col, na.rm = T) else NA)
sums = as.data.frame(t(sums)) # Convert to data frame
names(sums) = names(x) # Retain column names
rbind(x, sums) # Bind the sum row to the original data frame
}
# right column for total
sum_cols = function(x) {
x$Total = rowSums(x[sapply(x, is.numeric)], na.rm = T)
x
}
options(scipen=999) # disable scientific notation
#dollar format function
dollars = function(x) {
paste0("$",format(x,big.mark= ",",scientific=F))
}
desc_stats = function(x){
c(min = min(x,na.rm=T),
median = median(x,na.rm=T),
max = max(x,na.rm=T),
mean = mean(x,na.rm=T),
sd = sd(x,na.rm=T))
}
data=read_csv('data.csv',show_col_types = F)
# data = readxl::read_excel('C:/Users/alanh/Downloads/2025Apr_data.xlsx')
names(data) = make.names(colnames(data))
#SmartEDA::ExpData(data,type=2) %>% arrange(desc(Per_of_Missing))
skimr::skim(data)
Name | data |
Number of rows | 64016 |
Number of columns | 13 |
_______________________ | |
Column type frequency: | |
character | 7 |
numeric | 6 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1.00 | 1 | 147 | 0 | 39798 | 0 |
console | 0 | 1.00 | 2 | 6 | 0 | 81 | 0 |
genre | 0 | 1.00 | 3 | 16 | 0 | 20 | 0 |
publisher | 0 | 1.00 | 2 | 49 | 0 | 3383 | 0 |
developer | 17 | 1.00 | 2 | 68 | 0 | 8862 | 0 |
release_date | 7051 | 0.89 | 8 | 10 | 0 | 7922 | 0 |
last_update | 46137 | 0.28 | 8 | 10 | 0 | 1545 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
critic_score | 57338 | 0.10 | 7.22 | 1.46 | 1 | 6.40 | 7.50 | 8.30 | 10.00 | ▁▁▃▇▅ |
total_sales | 45094 | 0.30 | 0.35 | 0.81 | 0 | 0.03 | 0.12 | 0.34 | 20.32 | ▇▁▁▁▁ |
na_sales | 51379 | 0.20 | 0.26 | 0.49 | 0 | 0.05 | 0.12 | 0.28 | 9.76 | ▇▁▁▁▁ |
jp_sales | 57290 | 0.11 | 0.10 | 0.17 | 0 | 0.02 | 0.04 | 0.12 | 2.13 | ▇▁▁▁▁ |
pal_sales | 51192 | 0.20 | 0.15 | 0.39 | 0 | 0.01 | 0.04 | 0.14 | 9.85 | ▇▁▁▁▁ |
other_sales | 48888 | 0.24 | 0.04 | 0.13 | 0 | 0.00 | 0.01 | 0.03 | 3.12 | ▇▁▁▁▁ |
ACR_tab = data1 |>
group_by(genre) |>
summarise(critic_ratings_avg = mean(critic_score,na.rm=T),
total_sales_avg = mean(total_sales,na.rm=T))
print(ACR_tab,print=Inf)
## # A tibble: 20 × 3
## genre critic_ratings_avg total_sales_avg
## <chr> <dbl> <dbl>
## 1 Action 6.83 0.784
## 2 Action-Adventure 7.68 1.00
## 3 Adventure 7.34 0.572
## 4 Board Game 5.3 0.3
## 5 Education 7.75 0.305
## 6 Fighting 7.39 0.632
## 7 MMO 7.78 0.29
## 8 Misc 6.88 0.727
## 9 Music 7.99 0.836
## 10 Party 5.97 0.375
## 11 Platform 7.31 0.500
## 12 Puzzle 7.20 0.283
## 13 Racing 7.19 0.746
## 14 Role-Playing 7.47 0.545
## 15 Sandbox 9.2 1.89
## 16 Shooter 7.28 1.18
## 17 Simulation 6.91 0.659
## 18 Sports 7.29 0.902
## 19 Strategy 7.43 0.268
## 20 Visual Novel 8.5 0.03
ATS_tab = data1 |>
group_by(developer) |>
summarise(total_sales_avg = mean(total_sales,na.rm=T))
print(ATS_tab)
## # A tibble: 1,576 × 2
## developer total_sales_avg
## <chr> <dbl>
## 1 .theprodukkt NaN
## 2 10tacle Studios AG 0.16
## 3 11 bit studios NaN
## 4 1C Company NaN
## 5 1C: Maddox Games 0.05
## 6 1C:Ino-Co NaN
## 7 1st Playable Productions 3.14
## 8 2015, Inc. 0
## 9 2D Boy 0.03
## 10 2K Australia / 2K Boston 2.83
## # ℹ 1,566 more rows
data1 |>
group_by(developer,title) |>
summarise(most_sold_game = max(na_sales,na.rm = T)) |>
arrange(desc(most_sold_game)) |>
distinct(developer,.keep_all = T)
data1 |>
filter(release_date2==2018) |>
group_by(title) |>
summarise(highly_sold_game = max(total_sales,na.rm = T)) |>
arrange(desc(highly_sold_game))
# Red Dead Redemption 2
rel_list = list('critic_score','total_sales')
for (i in rel_list){
x=sum(is.na(data1[[i]]))
print(paste0(i,":"," ",x))
}
## [1] "critic_score: 0"
## [1] "total_sales: 2552"
data2 = data1 |>
filter(!is.na(total_sales))
for (i in rel_list){
x=shapiro.test(data2[[i]])
print(i)
print(x)
}
## [1] "critic_score"
##
## Shapiro-Wilk normality test
##
## data: data2[[i]]
## W = 0.95407, p-value < 0.00000000000000022
##
## [1] "total_sales"
##
## Shapiro-Wilk normality test
##
## data: data2[[i]]
## W = 0.47624, p-value < 0.00000000000000022
for (i in rel_list){
hist(data2[[i]],main=i,xlab=i,probability = T)
lines(density(data1[[i]],na.rm = T),lwd=2,col='darkred')
}
if (FALSE) {
hist(data1$total_sales,probability = T)
lines(density(data1$total_sales))
hist(data1$critic_score,probability = T)
lines(density(data1$critic_score))
}
cor_test_list = list('s','k')
for (i in cor_test_list){
x= cor.test(data2$critic_score,data2$total_sales,method = i)
print(x)
}
##
## Spearman's rank correlation rho
##
## data: data2$critic_score and data2$total_sales
## S = 7752046553, p-value < 0.00000000000000022
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.337814
##
##
## Kendall's rank correlation tau
##
## data: data2$critic_score and data2$total_sales
## z = 22.401, p-value < 0.00000000000000022
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.2364818
#yes, the Spearman rank shows a
The Spearman’s showed a significant positive correlation between critic score and total sales, rho = 0.34, p < .001. This is consistent with Kendall’s (tau = 0.24, p < .001), indicating that higher critic scores are correlated with greater sales.
data2 |>
ggplot(aes(x=critic_score,y=total_sales,col=total_sales))+
geom_point()+
theme_bw()+
geom_jitter()+
#geom_smooth()+
labs(title='In the Eyes of the Beholder')+
theme(plot.title = element_text(hjust = .5))+
guides(col='none')
data3 = data2 |>
filter(grepl('ea sports',publisher,ignore.case = T)) |>
mutate(PUBLISHER_2 = case_when(publisher == 'EA Sports BIG' ~ 'EA Sports',TRUE ~ publisher))
data3 |>
group_by(release_date2) |>
summarise(HEADCOUNT =n()) |>
ggplot(aes(x=release_date2,y=HEADCOUNT))+
geom_line(col='darkgreen',lwd=1)+
theme_bw()+
labs(x='Release Year',y='Games Sold',title='The EA Way by EA Games',subtitle='(the way of microtransactions to sell, sell, sell!)') +
geom_text(aes(label = HEADCOUNT,vjust = -.5,hjust = 2),size = 3.5,col='darkred')+
theme(plot.subtitle = element_text(hjust = .5),plot.title = element_text(hjust = .5))
data3 |>
ggplot(aes(x=release_date2))+
geom_bar(fill='darkgreen',col='gold')+
theme_bw()+
labs(x='Release Year',y='Games Sold',title='The EA Way by EA Games',subtitle='(the way of microtransactions to sell, sell, sell!)') +
geom_text(stat='count',aes(label = after_stat(count)),vjust = -0.5,size = 3,col='darkred')+
theme(plot.subtitle = element_text(hjust = .5),plot.title = element_text(hjust = .5))
data4 = data3 |>
select(contains('sale')) |>
na.omit()
cor_frame = cor(data4) |> as.data.frame()
cor_frame
cor_frame2 = cor_frame |>
rownames_to_column('var1') |>
pivot_longer(-var1,names_to = 'var2',values_to = 'corr')
cor_frame2 |>
ggplot(aes(x=var1,y=var2,fill=corr))+
geom_tile()+scale_fill_gradient2(
high = "darkred", low = "lightblue", mid = "gray",midpoint = 0, limit = c(-1, 1), space = "Lab",name = "Correlation") +
theme_bw() +
labs(title='Sales Matrix',y='',x='')+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
plot.title = element_text(hjust = .5))+
geom_text(aes(label = round(corr, 2)))