Video Game Sales Analysis Based on Genre

Chi-Square Test

Kishan Manojbhai Ambani (S3957794) and Naitik Digant Mandalia (S3957789)

Last updated: 01 June, 2023

Introduction

Introduction Cont.

Problem Statement

Data

Imported Data

# Load the raw sales data from the working directory and preview the
# first five rows as a formatted table.
game_sales <- read.csv(file = "vgsales.csv")
knitr::kable(head(game_sales, n = 5))
Rank Name Platform Year Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
1 Wii Sports Wii 2006 Sports Nintendo 41.49 29.02 3.77 8.46 82.74
2 Super Mario Bros. NES 1985 Platform Nintendo 29.08 3.58 6.81 0.77 40.24
3 Mario Kart Wii Wii 2008 Racing Nintendo 15.85 12.88 3.79 3.31 35.82
4 Wii Sports Resort Wii 2009 Sports Nintendo 15.75 11.01 3.28 2.96 33.00
5 Pokemon Red/Pokemon Blue GB 1996 Role-Playing Nintendo 11.27 8.89 10.22 1.00 31.37

Data Cleaning

Data After Cleaning

# Preview the first five rows of the cleaned data frame (`df` is built in
# the data-cleaning step, which is not shown here).
knitr::kable(x = head(df, n = 5))
Genre NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
Sports 41.49 29.02 3.77 8.46 82.74
Platform 29.08 3.58 6.81 0.77 40.24
Racing 15.85 12.88 3.79 3.31 35.82
Sports 15.75 11.01 3.28 2.96 33.00
Role-Playing 11.27 8.89 10.22 1.00 31.37

Descriptive Statistics

# Per-genre summary statistics for each of the five sales columns.
# summarise_at() names each result column <sales column>_<statistic>, e.g.
# NA_Sales_Mean or Global_Sales_Quantile_3.
# NOTE(review): summarise_at(), gather() and spread() are superseded by
# across() / pivot_longer() / pivot_wider(). They are kept here because the
# display code that follows selects columns and rows of `final` by position,
# so the resulting column/row order must not change.
df_description <- df %>%
  group_by(Genre) %>%
  summarise_at(
    vars("NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"),
    list(
      Mean = mean,
      Median = median,
      Standard_Deviation = sd,
      Minimum = min,
      Maximum = max,
      Interquartile_Range = IQR,
      Quantile_1 = ~ quantile(., probs = 0.25),
      Quantile_3 = ~ quantile(., probs = 0.75)
    )
  )

# Long form: one row per (Genre, statistic column). factor_key = TRUE keeps
# the statistic columns in their original order instead of sorting them.
datalong <- gather(
  df_description, "Variable", "Value",
  NA_Sales_Mean:Global_Sales_Quantile_3,
  factor_key = TRUE
)

# Wide form: one row per statistic column, one column per genre.
final <- spread(datalong, Genre, Value)

# Split "<region>_Sales_<statistic>" on the "_Sales_" separator rather than
# on the last underscore, so multi-word statistic names stay intact
# ("Standard Deviation", "Interquartile Range") instead of being cut down to
# "Deviation" / "Range". The label must be derived before Variable is
# overwritten with the region prefix.
final$`Summary Statistics` <- sub("^.*_Sales_", "", final$Variable)
final$`Summary Statistics` <- sub(
  "^Quantile_", "Quartile_", final$`Summary Statistics`
)
final$`Summary Statistics` <- gsub(
  "_", " ", final$`Summary Statistics`,
  fixed = TRUE
)

# Region prefix only: "NA", "EU", "JP", "Other", "Global".
final$Variable <- sub("_Sales_.*$", "", final$Variable)

# Display copy of the summary table: move the "Summary Statistics" label
# (column 14) next to the region column, followed by the genre columns.
temp_final <- final[, c(1, 14, 2:13)]

# Show each statistic's label only on the first row of its group and blank
# the repeats. duplicated() finds the repeats directly, so no hard-coded row
# numbers are needed and the result adapts if the number of sales columns or
# statistics changes.
temp_final$`Summary Statistics`[
  duplicated(temp_final$`Summary Statistics`)
] <- ""

knitr::kable(temp_final)
Variable Summary Statistics Action Adventure Fighting Misc Platform Puzzle Racing Role-Playing Shooter Simulation Sports Strategy
NA Mean 0.2647256 0.0822706 0.2636675 0.2359057 0.5045711 0.2126804 0.2877662 0.2199462 0.4447328 0.2114302 0.2912830 0.1008811
EU 0.1583233 0.0498678 0.1194811 0.1241978 0.2275734 0.0872509 0.1908647 0.1263844 0.2391374 0.1307728 0.1606351 0.0665786
JP 0.0482358 0.0404899 0.1030071 0.0619666 0.1475959 0.0984708 0.0453883 0.2367675 0.0292214 0.0734717 0.0577025 0.0726285
Other 0.0565078 0.0130715 0.0432547 0.0433122 0.0582280 0.0215636 0.0618655 0.0400605 0.0783893 0.0363552 0.0575320 0.0166814
Global 0.5281001 0.1858787 0.5293750 0.4657619 0.9383409 0.4208763 0.5861009 0.6232325 0.7918855 0.4523645 0.5673188 0.2571512
NA Median 0.1000000 0.0000000 0.0800000 0.0800000 0.1400000 0.0500000 0.1000000 0.0400000 0.1200000 0.0700000 0.1100000 0.0000000
EU 0.0300000 0.0000000 0.0300000 0.0100000 0.0500000 0.0100000 0.0400000 0.0100000 0.0500000 0.0100000 0.0200000 0.0100000
JP 0.0000000 0.0100000 0.0100000 0.0000000 0.0000000 0.0000000 0.0000000 0.0500000 0.0000000 0.0000000 0.0000000 0.0000000
Other 0.0100000 0.0000000 0.0100000 0.0100000 0.0100000 0.0100000 0.0100000 0.0100000 0.0200000 0.0100000 0.0100000 0.0000000
Global 0.1900000 0.0600000 0.2100000 0.1600000 0.2800000 0.1100000 0.1900000 0.1850000 0.2300000 0.1600000 0.2200000 0.0900000
NA Deviation 0.5668896 0.2746736 0.5161483 0.6908782 1.5020388 1.0576687 0.7425235 0.6727209 1.2011471 0.4666984 1.0410239 0.2989562
EU 0.4074241 0.1764542 0.2530129 0.4465925 0.5885448 0.3185499 0.5811809 0.4888644 0.5491891 0.5140971 0.7767721 0.1925942
JP 0.1648351 0.1200110 0.2589080 0.2456021 0.5252417 0.3647453 0.2666801 0.6474546 0.1023976 0.3033931 0.2236471 0.1714600
Other 0.2397537 0.0447037 0.1055576 0.1435152 0.1661512 0.0708551 0.2711354 0.1251648 0.1995107 0.1171332 0.2438919 0.0412150
Global 1.1564272 0.5132800 0.9559647 1.3148859 2.5852543 1.5617163 1.6624370 1.7079089 1.8172633 1.1952546 2.0897159 0.5209082
NA Minimum 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
EU 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
JP 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Other 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Global 0.0100000 0.0100000 0.0100000 0.0100000 0.0100000 0.0100000 0.0100000 0.0100000 0.0100000 0.0100000 0.0100000 0.0100000
NA Maximum 9.6300000 6.1600000 6.7500000 14.9700000 29.0800000 23.2000000 15.8500000 11.2700000 26.9300000 9.0700000 41.4900000 3.1800000
EU 9.2700000 2.7900000 2.6100000 9.2600000 9.2300000 5.3600000 12.8800000 8.8900000 5.8800000 11.0000000 29.0200000 2.2700000
JP 3.9600000 2.6900000 2.8700000 4.1600000 6.8100000 5.3200000 4.1300000 10.2200000 1.4400000 5.3300000 3.7700000 1.6100000
Other 10.5700000 0.8300000 1.4100000 2.8500000 2.9000000 1.1800000 7.5300000 1.7400000 2.5200000 2.7500000 8.4600000 0.5900000
Global 21.4000000 11.1800000 13.0400000 29.0200000 40.2400000 30.2600000 35.8200000 31.3700000 28.3100000 24.7600000 82.7400000 5.4500000
NA Range 0.2400000 0.0700000 0.2900000 0.2100000 0.3900000 0.1300000 0.2400000 0.1600000 0.3400000 0.2400000 0.2800000 0.0800000
EU 0.1500000 0.0300000 0.1200000 0.0850000 0.1800000 0.0500000 0.1500000 0.0700000 0.2200000 0.0700000 0.1100000 0.0400000
JP 0.0300000 0.0400000 0.1000000 0.0200000 0.0400000 0.0200000 0.0000000 0.1900000 0.0100000 0.0200000 0.0100000 0.0600000
Other 0.0500000 0.0100000 0.0400000 0.0300000 0.0500000 0.0100000 0.0500000 0.0300000 0.0700000 0.0300000 0.0400000 0.0100000
Global 0.4300000 0.1400000 0.4700000 0.3500000 0.7000000 0.2675000 0.4600000 0.4525000 0.6475000 0.3700000 0.4700000 0.2300000
NA Quartile 1 0.0200000 0.0000000 0.0000000 0.0000000 0.0500000 0.0100000 0.0300000 0.0000000 0.0300000 0.0000000 0.0200000 0.0000000
EU 0.0000000 0.0000000 0.0000000 0.0000000 0.0100000 0.0000000 0.0100000 0.0000000 0.0100000 0.0000000 0.0000000 0.0000000
JP 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Other 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Global 0.0700000 0.0200000 0.0800000 0.0600000 0.0900000 0.0400000 0.0700000 0.0700000 0.0800000 0.0500000 0.0900000 0.0400000
NA Quartile 3 0.2600000 0.0700000 0.2900000 0.2100000 0.4400000 0.1400000 0.2700000 0.1600000 0.3700000 0.2400000 0.3000000 0.0800000
EU 0.1500000 0.0300000 0.1200000 0.0850000 0.1900000 0.0500000 0.1600000 0.0700000 0.2300000 0.0700000 0.1100000 0.0400000
JP 0.0300000 0.0400000 0.1000000 0.0200000 0.0400000 0.0200000 0.0000000 0.1900000 0.0100000 0.0200000 0.0100000 0.0600000
Other 0.0500000 0.0100000 0.0400000 0.0300000 0.0500000 0.0100000 0.0500000 0.0300000 0.0700000 0.0300000 0.0400000 0.0100000
Global 0.5000000 0.1600000 0.5500000 0.4100000 0.7900000 0.3075000 0.5300000 0.5225000 0.7275000 0.4200000 0.5600000 0.2700000

Descriptive Statistics Cont.

# One boxplot per sales column, each showing the distribution of that
# column's values within every genre.

# North America
ggplot(data = df, mapping = aes(x = Genre, y = NA_Sales)) +
  geom_boxplot() +
  labs(
    x = "Genre",
    y = "North America Sales",
    title = "Distribution of Sale data of Different Genres in North America"
  )

# Europe
ggplot(data = df, mapping = aes(x = Genre, y = EU_Sales)) +
  geom_boxplot() +
  labs(
    x = "Genre",
    y = "Europe Sales",
    title = "Distribution of Sale data of Different Genres in Europe"
  )

# Japan
ggplot(data = df, mapping = aes(x = Genre, y = JP_Sales)) +
  geom_boxplot() +
  labs(
    x = "Genre",
    y = "Japan Sales",
    title = "Distribution of Sale data of Different Genres in Japan"
  )

# Other regions
ggplot(data = df, mapping = aes(x = Genre, y = Other_Sales)) +
  geom_boxplot() +
  labs(
    x = "Genre",
    y = "Other Sales",
    title = "Distribution of Sale data of Different Genres in Other Regions"
  )

# Global
ggplot(data = df, mapping = aes(x = Genre, y = Global_Sales)) +
  geom_boxplot() +
  labs(
    x = "Genre",
    y = "Global Sales",
    title = "Distribution of Sale data of Different Genres at Global Level"
  )

# Manual fill palette for the genre chart below (13 colours are listed).
custom_colors <- c(
  "#FF0000", "#00FF00", "#0000FF", "#FFFF00", "#00FFFF", "#FF00FF",
  "#800000", "#008000", "#000080", "#808000", "#800080", "#008080",
  "#808080"
)

# Single horizontal stacked bar, filled by Genre and normalised to
# proportions (position = "fill"). geom_bar() counts rows, so each segment is
# that genre's share of the rows in df.
# NOTE(review): the title speaks of "Sale Distribution", but the bar reflects
# row counts per genre rather than summed sales - confirm this is intended.
ggplot(data = df, mapping = aes(x = "", fill = Genre)) +
  geom_bar(width = 1, position = "fill") +
  coord_flip() +
  scale_fill_manual(values = custom_colors) +
  labs(
    x = "",
    y = "Proportion",
    title = "Sale Distribution With Respect To Genre"
  )

Hypothesis Testing

# Run a chi-square test of association on a contingency table and print the
# test statistic, degrees of freedom and p-value.
#
# Arguments:
#   input_table  Two-way contingency table (or count matrix).
#   col_name     Label printed in the report header to identify the analysis.
#
# Returns NULL invisibly (the value of the final cat() call); the function is
# called for its printed output.
chiSquare_func <- function(input_table, col_name) {
  # chisq.test() is used only to obtain the expected counts under
  # independence; it may emit a warning of its own when the chi-square
  # approximation is doubtful for the supplied counts.
  result <- chisq.test(input_table)
  expected <- result$expected

  # Pearson statistic computed directly from observed and expected counts,
  # i.e. without any continuity correction.
  chi_square <- sum((input_table - expected)^2 / expected)

  # Degrees of freedom: (rows - 1) * (columns - 1).
  df_stat <- (nrow(input_table) - 1) * (ncol(input_table) - 1)

  # Upper-tail probability. Asking pchisq() for the upper tail directly
  # avoids the cancellation in `1 - pchisq(...)`, which collapses to exactly
  # 0 once the tail probability falls below double-precision resolution.
  p_value <- pchisq(chi_square, df_stat, lower.tail = FALSE)

  # Print the result
  cat("The Chi-Square Analysis for :- ", col_name, "\n")
  cat("Chi-square test statistic:", chi_square, "\n")
  cat("Degrees of freedom:", df_stat, "\n")
  cat("p-value:", p_value, "\n")
}

# Genre x North America sales band contingency table, then the chi-square
# report. cut() bins NA_Sales into four labelled bands at the given breaks.
table_data <- table(
  df[["Genre"]],
  cut(
    df[["NA_Sales"]],
    breaks = c(-Inf, 5, 10, 15, Inf),
    labels = c("Low", "Medium", "High", "Very High")
  )
)
chiSquare_func(input_table = table_data, col_name = "NA_Sales")
## The Chi-Square Analysis for :-  NA_Sales 
## Chi-square test statistic: 102.2899 
## Degrees of freedom: 33 
## p-value: 5.142919e-09
# Genre x Europe sales band contingency table, then the chi-square report.
# cut() bins EU_Sales into four labelled bands at the given breaks.
table_data <- table(
  df[["Genre"]],
  cut(
    df[["EU_Sales"]],
    breaks = c(-Inf, 3, 5, 8, Inf),
    labels = c("Low", "Medium", "High", "Very High")
  )
)
chiSquare_func(input_table = table_data, col_name = 'EU_Sales')
## The Chi-Square Analysis for :-  EU_Sales 
## Chi-square test statistic: 49.55325 
## Degrees of freedom: 33 
## p-value: 0.03212222
# Genre x Japan sales band contingency table, then the chi-square report.
# cut() bins JP_Sales into four labelled bands at the given breaks.
table_data <- table(
  df[["Genre"]],
  cut(
    df[["JP_Sales"]],
    breaks = c(-Inf, 0.5, 1, 1.5, Inf),
    labels = c("Low", "Medium", "High", "Very High")
  )
)
chiSquare_func(input_table = table_data, col_name = 'JP_Sales')
## The Chi-Square Analysis for :-  JP_Sales 
## Chi-square test statistic: 401.2928 
## Degrees of freedom: 33 
## p-value: 0
# Genre x Other-regions sales band contingency table, then the chi-square
# report. cut() bins Other_Sales into four labelled bands at the given breaks.
table_data <- table(
  df[["Genre"]],
  cut(
    df[["Other_Sales"]],
    breaks = c(-Inf, 1, 1.5, 2.5, Inf),
    labels = c("Low", "Medium", "High", "Very High")
  )
)
chiSquare_func(input_table = table_data, col_name = 'Other_Sales')
## The Chi-Square Analysis for :-  Other_Sales 
## Chi-square test statistic: 42.72626 
## Degrees of freedom: 33 
## p-value: 0.1196447
# Genre x Global sales band contingency table, then the chi-square report.
# cut() bins Global_Sales into four labelled bands at the given breaks.
table_data <- table(
  df$Genre,
  cut(
    df$Global_Sales,
    breaks = c(-Inf, 10, 20, 30, Inf),
    labels = c("Low", "Medium", "High", "Very High")
  )
)
# Report label uses a single underscore, matching the column name and the
# labels used for the other four regions.
chiSquare_func(table_data, "Global_Sales")
## The Chi-Square Analysis for :-  Global__Sales 
## Chi-square test statistic: 68.15063 
## Degrees of freedom: 33 
## p-value: 0.0003062696

Hypothesis Testing Cont.

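Test statistic: \[\chi^2 = \sum_{i=1}^{r} \sum_{j=1}^{c} \frac{(O_{ij} - E_{ij})^2}{E_{ij}}\] where \(O_{ij}\) and \(E_{ij}\) are the observed and expected counts in row \(i\) (genre) and column \(j\) (sales band). Degrees of freedom: \[df = (r - 1)(c - 1)\]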

Discussion

References