# Installing libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(ggplot2)
library(dplyr)
# Reading in Data
all_games <- read_csv("all_games.csv")
## Rows: 18800 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): name, platform, release_date, summary, user_review
## dbl (1): meta_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Metacritic is one of the leading sources for aggregated reviews on all sorts of media since 1999, though my research will pertain to videogames. It offering a metascore, the weighted average of published critic reviews, and a user score which averages regular users.
The data being used in this research is open source from Kaggle, provided by the user named Deep Contractor. He originally collected the data from the Metacritic website, speficically a section containing the scores of console and PC games released from 1995 to 2021. The dataset contains a total of 18,800 games with six variables, name of the game, its platform, release date, a summary of the game, the games metacritic score, and its user review. For the purposes of my research I will mainly focus on release date, metacritic score and usser review.
# Summarize Data
str(all_games)
## spec_tbl_df [18,800 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ name : chr [1:18800] "The Legend of Zelda: Ocarina of Time" "Tony Hawk's Pro Skater 2" "Grand Theft Auto IV" "SoulCalibur" ...
## $ platform : chr [1:18800] "Nintendo 64" "PlayStation" "PlayStation 3" "Dreamcast" ...
## $ release_date: chr [1:18800] "November 23, 1998" "September 20, 2000" "April 29, 2008" "September 8, 1999" ...
## $ summary : chr [1:18800] "As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human uses Link to gain "| __truncated__ "As most major publishers' development efforts shift to any number of next-generation platforms, Tony Hawk 2 wil"| __truncated__ "[Metacritic's 2008 PS3 Game of the Year; Also known as \"GTA IV\"] What does the American Dream mean today? For"| __truncated__ "This is a tale of souls and swords, transcending the world and all its history, told for all eternity... The gr"| __truncated__ ...
## $ meta_score : num [1:18800] 99 98 98 98 98 97 97 97 97 97 ...
## $ user_review : chr [1:18800] "9.1" "7.4" "7.7" "8.4" ...
## - attr(*, "spec")=
## .. cols(
## .. name = col_character(),
## .. platform = col_character(),
## .. release_date = col_character(),
## .. summary = col_character(),
## .. meta_score = col_double(),
## .. user_review = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Converting metascore from character vector to numeric
head(as.numeric(all_games$user_review, na.rm = TRUE))
## Warning in head(as.numeric(all_games$user_review, na.rm = TRUE)): NAs introduced
## by coercion
## [1] 9.1 7.4 7.7 8.4 7.9 9.1
head(as.numeric(all_games$meta_score))
## [1] 99 98 98 98 98 97
# Mean metascores
mean(all_games$meta_score)
## [1] 70.64888
# Converting user review from character to numeric
class(all_games$user_review) <- "numeric"
## Warning in class(all_games$user_review) <- "numeric": NAs introduced by coercion
str(all_games$user_review)
## num [1:18800] 9.1 7.4 7.7 8.4 7.9 9.1 9.1 8 7.9 8.3 ...
# Mean for user review
mean(all_games$user_review, na.rm = TRUE)
## [1] 6.990846
# Creating data frames for each xbox console and joining them
xbox <- all_games[all_games$platform == 'Xbox',]
xbox_360 <- all_games[all_games$platform == 'Xbox 360',]
xbox_one <- all_games[all_games$platform == 'Xbox One',]
xbox_x <- all_games[all_games$platform == 'Xbox Series X',]
xbox_all <- rbind(xbox, xbox_360, xbox_one, xbox_x)
# Creating data frames for each playstation console and joinging them
playstation <- all_games[all_games$platform == 'PlayStation',]
playstation_2 <- all_games[all_games$platform == 'PlayStation 2',]
playstation_3 <- all_games[all_games$platform == 'PlayStation 3',]
playstation_4 <- all_games[all_games$platform == 'PlayStation 4',]
Playstation_5 <- all_games[all_games$platform == 'PlayStation 5',]
playstation_all <- rbind(playstation, playstation_2, playstation_3, playstation_4, Playstation_5)
# Creating data frames for each nintendo console and joing them
Nintendo64 <- all_games[all_games$platform == 'Nintendo 64',]
DS <-all_games[all_games$platform == 'DS',]
IIIDS <- all_games[all_games$platform == '3DS',]
GameCube <- all_games[all_games$platform == 'GameCube',]
gameboyADV <- all_games[all_games$platform == 'Game Boy Advance',]
Wii <- all_games[all_games$platform == 'Wii',]
WiiU <- all_games[all_games$platform == 'Wii U',]
switch <- all_games[all_games$platform == 'Switch',]
nintendo <- rbind(Nintendo64, DS, IIIDS, GameCube, Wii, WiiU, switch)
# Creating data frame for PC
PC <- all_games[all_games$platform == 'PC',]
# Calculating mean metascore for xbox
mean(xbox_all$meta_score)
## [1] 70.39713
# Calculating mean user review
mean(xbox_all$user_review, na.rm = TRUE)
## [1] 6.817331
# Calculating mean metascore for playstation
mean(playstation_all$meta_score)
## [1] 70.56422
# Calculating mean user review
mean(playstation_all$user_review, na.rm = TRUE)
## [1] 6.936797
# Calculating mean metascore for nintendo
mean(nintendo$meta_score)
## [1] 69.92667
# Calculating mean user review
mean(nintendo$user_review, na.rm = TRUE)
## [1] 7.196874
# Calculating mean metascore for PC
mean(PC$meta_score)
## [1] 71.7979
# Calculating mean user review for PC
mean(PC$user_review, na.rm = TRUE)
## [1] 6.918197
str(all_games$release_date)
## chr [1:18800] "November 23, 1998" "September 20, 2000" "April 29, 2008" ...
# Creating alternate general platform column
all_games$platform2 <- all_games$platform
all_games$platform2[all_games$platform2 == 'Xbox 360'] <- 'Xbox'
all_games$platform2[all_games$platform2 == 'Xbox One'] <- 'Xbox'
all_games$platform2[all_games$platform2 == 'Xbox Series X'] <- 'Xbox'
all_games$platform2[all_games$platform2 == 'PlayStation 2'] <- 'PlayStation'
all_games$platform2[all_games$platform2 == 'PlayStation 3'] <- 'PlayStation'
all_games$platform2[all_games$platform2 == 'PlayStation 4'] <- 'PlayStation'
all_games$platform2[all_games$platform2 == 'PlayStation 5'] <- 'PlayStation'
all_games$platform2[all_games$platform2 == 'PSP'] <- 'Playstation'
all_games$platform2[all_games$platform2 == 'PlayStation Vita'] <- 'PlayStation'
all_games$platform2[all_games$platform2 == 'Nintendo 64'] <- 'Nintendo'
all_games$platform2[all_games$platform2 == 'GameCube'] <- 'Nintendo'
all_games$platform2[all_games$platform2 == 'Wii'] <- 'Nintendo'
all_games$platform2[all_games$platform2 == 'Wii U'] <- 'Nintendo'
all_games$platform2[all_games$platform2 == 'DS'] <- 'Nintendo'
all_games$platform2[all_games$platform2 == 'Game Boy Advance'] <- 'Nintendo'
all_games$platform2[all_games$platform2 == 'Switch'] <- 'Nintendo'
# Converting date nto months and years for easier analysis
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
all_games$month <- ym(format(mdy(all_games$release_date), '%Y-%m'))
# Linear model to show the relationship between meta score and release date
lm(all_games$meta_score ~ poly(all_games$month, 2)) %>% summary()
##
## Call:
## lm(formula = all_games$meta_score ~ poly(all_games$month, 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.461 -6.940 1.483 8.590 29.035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.64888 0.08831 799.99 <2e-16 ***
## poly(all_games$month, 2)1 128.15739 12.10873 10.58 <2e-16 ***
## poly(all_games$month, 2)2 192.50871 12.10873 15.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.11 on 18797 degrees of freedom
## Multiple R-squared: 0.01904, Adjusted R-squared: 0.01893
## F-statistic: 182.4 on 2 and 18797 DF, p-value: < 2.2e-16
ggsave("metaVStime.pdf", device="pdf", width=8, height=8, units="in",dpi=300)
Release date is a statistically significant predictor for meta_score
# Running anova test to determine platform significance
full_model <- lm(meta_score ~ month + platform2, data = all_games)
reduced_model <- lm(meta_score ~ month, data = all_games)
anova(full_model, reduced_model )
## Analysis of Variance Table
##
## Model 1: meta_score ~ month + platform2
## Model 2: meta_score ~ month
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 18791 2779540
## 2 18798 2793101 -7 -13561 13.097 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggsave("anova.pdf", device="pdf", width=8, height=8, units="in",dpi=300)
Platform is a statistically significant predicor variable for meta_score
# Creating scatterlot to show mean meta score over time
all_games%>%
group_by(month) %>%
summarize(mean_metascore = mean(meta_score)) %>%
ggplot(aes(x = month, y = mean_metascore)) +
geom_point() + geom_abline() +
ggtitle("Meta Score Means Over Time") +
labs(x = 'Year', y = "Mean Meta Score")
ggsave("Meta_means.pdf", device="pdf", width=8, height=8, units="in",dpi=300)
Release date is a statistically significant predictor for meta score - it appears to be a positive quadratic relationship.
# Creating scatterplot to show mean user score over time
all_games%>%
group_by(month) %>%
summarize(mean_userscore = mean(user_review)) %>%
ggplot(aes(x = month, y = mean_userscore)) +
geom_point() +
ggtitle("User Score Means Over Time")
## Warning: Removed 230 rows containing missing values (geom_point).
labs(x = 'Year', y = "Mean User Score")
## $x
## [1] "Year"
##
## $y
## [1] "Mean User Score"
##
## attr(,"class")
## [1] "labels"
ggsave("User_means.pdf", device="pdf", width=8, height=8, units="in",dpi=300)
## Warning: Removed 230 rows containing missing values (geom_point).
There appears to be a lot of missing data, regardless, from what we have it appears to be lower on average than meta_score
As hypothesized, platform was a determinant factor when it came to predicting meta_score. Similarly, reslease date was a statistically significant predictor for metascore, and while reviews did decrease substantially from 1999-2021, my plot show and upwards trend in recent years. Observing the two plots comparatively, there are faint similarities in terms of trend, despite all of the missing user score data. Though, based on what we can observe over time, critic scores tend to be higher. On average, early video games (before 2000) rank extremely high compared to newer games. Why is this? It is likely a result of the exponential boom of the videogame industry at the turn of the century. Not only was there greater competition and saturation of games, but many developers sought to make the cheapest games possible leading to the apparent decline. Although as the plots show, the relationship is quadratic, suggesting a recent improvement in the industry thanks to technological advancements.