data <- read.csv("C:\\Users\\gajaw\\OneDrive\\Desktop\\STATS\\vgsales.csv")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Group by Genre and summarize by Global Sales

grouped_df1 <- data %>%
  group_by(Genre) %>%
  summarize(mean_global_sales = mean(Global_Sales, na.rm = TRUE), count = n())

print(grouped_df1)
## # A tibble: 12 × 3
##    Genre        mean_global_sales count
##    <chr>                    <dbl> <int>
##  1 Action                   0.528  3316
##  2 Adventure                0.186  1286
##  3 Fighting                 0.529   848
##  4 Misc                     0.466  1739
##  5 Platform                 0.938   886
##  6 Puzzle                   0.421   582
##  7 Racing                   0.586  1249
##  8 Role-Playing             0.623  1488
##  9 Shooter                  0.792  1310
## 10 Simulation               0.452   867
## 11 Sports                   0.567  2346
## 12 Strategy                 0.257   681

Group by Platform and Summarize by NA_Sales

grouped_df2 <- data %>%
  group_by(Platform) %>%
  summarize(mean_na_sales = mean(NA_Sales, na.rm = TRUE), count = n())

print(grouped_df2)
## # A tibble: 31 × 3
##    Platform mean_na_sales count
##    <chr>            <dbl> <int>
##  1 2600             0.681   133
##  2 3DO              0         3
##  3 3DS              0.155   509
##  4 DC               0.104    52
##  5 DS               0.181  2163
##  6 GB               1.17     98
##  7 GBA              0.228   822
##  8 GC               0.240   556
##  9 GEN              0.714    27
## 10 GG               0         1
## # ℹ 21 more rows

Group Publisher, Summarized by EU_Sales

grouped_df3 <- data %>%
  group_by(Publisher) %>%
  summarize(mean_eu_sales = mean(EU_Sales, na.rm = TRUE), count = n())

print(grouped_df3)
## # A tibble: 579 × 3
##    Publisher                    mean_eu_sales count
##    <chr>                                <dbl> <int>
##  1 10TACLE Studios                     0.0133     3
##  2 1C Company                          0.0233     3
##  3 20th Century Fox Video Games        0.02       5
##  4 2D Boy                              0.03       1
##  5 3DO                                 0.0844    36
##  6 49Games                             0.04       1
##  7 505 Games                           0.0856   192
##  8 5pb                                 0         61
##  9 7G//AMES                            0.015      4
## 10 989 Sports                          0.14       1
## # ℹ 569 more rows

Adding a Special tag to the lowest probability group in each dataframe

grouped_df1$group_tag <- ifelse(grouped_df1$count == min(grouped_df1$count), "Lowest Probability", "")
grouped_df2$group_tag <- ifelse(grouped_df2$count == min(grouped_df2$count), "Lowest Probability", "")
grouped_df3$group_tag <- ifelse(grouped_df3$count == min(grouped_df3$count), "Lowest Probability", "")
print(grouped_df1)
## # A tibble: 12 × 4
##    Genre        mean_global_sales count group_tag           
##    <chr>                    <dbl> <int> <chr>               
##  1 Action                   0.528  3316 ""                  
##  2 Adventure                0.186  1286 ""                  
##  3 Fighting                 0.529   848 ""                  
##  4 Misc                     0.466  1739 ""                  
##  5 Platform                 0.938   886 ""                  
##  6 Puzzle                   0.421   582 "Lowest Probability"
##  7 Racing                   0.586  1249 ""                  
##  8 Role-Playing             0.623  1488 ""                  
##  9 Shooter                  0.792  1310 ""                  
## 10 Simulation               0.452   867 ""                  
## 11 Sports                   0.567  2346 ""                  
## 12 Strategy                 0.257   681 ""
print(grouped_df2)
## # A tibble: 31 × 4
##    Platform mean_na_sales count group_tag           
##    <chr>            <dbl> <int> <chr>               
##  1 2600             0.681   133 ""                  
##  2 3DO              0         3 ""                  
##  3 3DS              0.155   509 ""                  
##  4 DC               0.104    52 ""                  
##  5 DS               0.181  2163 ""                  
##  6 GB               1.17     98 ""                  
##  7 GBA              0.228   822 ""                  
##  8 GC               0.240   556 ""                  
##  9 GEN              0.714    27 ""                  
## 10 GG               0         1 "Lowest Probability"
## # ℹ 21 more rows
print(grouped_df3)
## # A tibble: 579 × 4
##    Publisher                    mean_eu_sales count group_tag           
##    <chr>                                <dbl> <int> <chr>               
##  1 10TACLE Studios                     0.0133     3 ""                  
##  2 1C Company                          0.0233     3 ""                  
##  3 20th Century Fox Video Games        0.02       5 ""                  
##  4 2D Boy                              0.03       1 "Lowest Probability"
##  5 3DO                                 0.0844    36 ""                  
##  6 49Games                             0.04       1 "Lowest Probability"
##  7 505 Games                           0.0856   192 ""                  
##  8 5pb                                 0         61 ""                  
##  9 7G//AMES                            0.015      4 ""                  
## 10 989 Sports                          0.14       1 "Lowest Probability"
## # ℹ 569 more rows

Assigning special tags to the lowest probability group

data_tagged <- data %>% 
  left_join(grouped_df1 %>% select(Genre, group_tag), by = "Genre") %>%
  left_join(grouped_df2 %>% select(Platform, group_tag), by = "Platform") %>%
  left_join(grouped_df3 %>% select(Publisher, group_tag), by = "Publisher")
print(head(data_tagged, 10))
##    Rank                      Name Platform Year        Genre Publisher NA_Sales
## 1     1                Wii Sports      Wii 2006       Sports  Nintendo    41.49
## 2     2         Super Mario Bros.      NES 1985     Platform  Nintendo    29.08
## 3     3            Mario Kart Wii      Wii 2008       Racing  Nintendo    15.85
## 4     4         Wii Sports Resort      Wii 2009       Sports  Nintendo    15.75
## 5     5  Pokemon Red/Pokemon Blue       GB 1996 Role-Playing  Nintendo    11.27
## 6     6                    Tetris       GB 1989       Puzzle  Nintendo    23.20
## 7     7     New Super Mario Bros.       DS 2006     Platform  Nintendo    11.38
## 8     8                  Wii Play      Wii 2006         Misc  Nintendo    14.03
## 9     9 New Super Mario Bros. Wii      Wii 2009     Platform  Nintendo    14.59
## 10   10                 Duck Hunt      NES 1984      Shooter  Nintendo    26.93
##    EU_Sales JP_Sales Other_Sales Global_Sales        group_tag.x group_tag.y
## 1     29.02     3.77        8.46        82.74                               
## 2      3.58     6.81        0.77        40.24                               
## 3     12.88     3.79        3.31        35.82                               
## 4     11.01     3.28        2.96        33.00                               
## 5      8.89    10.22        1.00        31.37                               
## 6      2.26     4.22        0.58        30.26 Lowest Probability            
## 7      9.23     6.50        2.90        30.01                               
## 8      9.20     2.93        2.85        29.02                               
## 9      7.06     4.70        2.26        28.62                               
## 10     0.63     0.28        0.47        28.31                               
##    group_tag
## 1           
## 2           
## 3           
## 4           
## 5           
## 6           
## 7           
## 8           
## 9           
## 10

Visualizations

Bar plot for Grouped DataFrame 1 : Global Sales by Genre

barplot(grouped_df1$mean_global_sales, 
        names.arg = grouped_df1$Genre, 
        main = "Mean Global Sales by Genre", 
        col = "skyblue", 
        las = 2,  
        cex.names = 0.8)

Insights:

Significance:

Further Questions:

Scatterplot for Grouped Dataframe 2: NA_Sales by Platform

plot(grouped_df2$count, 
     grouped_df2$mean_na_sales, 
     main = "Mean NA Sales by Number of Games per Platform", 
     xlab = "Number of Games (Count)", 
     ylab = "Mean NA Sales", 
     col = "green", 
     pch = 16,  
     cex = 1.2) 

Insights:

Significance:

Further Questions:

Barplot for Grouped Dataframe 3: EU Sales by Publisher

boxplot(mean_eu_sales ~ Publisher, 
        data = grouped_df3, 
        main = "Mean EU Sales Distribution by Publisher", 
        xlab = "Publisher", 
        ylab = "Mean EU Sales", 
        col = "lightpink", 
        las = 2,
        cex.axis = 0.7)  

Insights:

Significance:

Further Questions:

Testing Hypothesis for Dataframes

Dataframe 1: Grouped by Genre
Hypothesis: Production costs, developer specialisation, and differing degrees of customer demand may all have an impact on how uncommon a certain game genre becomes.

Observations: Certain genres are less commonly represented than others in video games, according to an analysis of the distribution of these genres. This suggests a possible link between genre rarity and several influencing factors. Further investigation could explore

  • Certain genres may attract a smaller audience, leading to lower production volumes.

  • Some genres might be more expensive or technically challenging to develop, discouraging production.

Game developers and studios might focus on specific genres based on their expertise or past success, resulting in fewer games in less familiar genres.

Dataframe 2: Grouped by Platform

Hypothesis: Platform popularity, market share, or developer exclusivity agreements may be the main factors influencing a game’s rarity on a certain platform.

Observation: Examining how games are distributed among various platforms reveals that some have a notably less selection of games than others. This may be related to a number of things, including:

  • Due to lower potential sales, platforms with a smaller user base might draw fewer developers.

  • Due to exclusive contracts with publishers or developers, certain platforms may restrict the number of games that are available.

  • Platforms that are less appealing for game creation may be differentiated by differences in hardware capabilities or development tools.

Dataframe 3: Grouped by and EU sales and Publisher

Hypothesis: Market conditions, budgetary restraints, or shifts in corporate strategy may have an impact on the scarcity of games released by particular publishers in particular years.

Observations: Certain combinations are less prevalent when the quantity of games published by publishers throughout various years is analysed. This may point to a number of influential factors, including:

  • A decline in the number of games launched in a given year could be caused by changes in market trends or economic downturns.

  • Publishers may experience financial setbacks that restrict their ability to deliver games at particular times.

  • The frequency of game publication may be affected by changes in company strategy, such as concentrating on different game genres or limiting the number of releases.

    Building a data frame for two Categorical Columns and their combinations

    unique_platforms <- unique(data$Platform)
    unique_genres <- unique(data$Genre)
    
    all_combinations <- expand.grid(Platform = unique_platforms, Genre = unique_genres)
    
    print(head(all_combinations,25))
    ##    Platform  Genre
    ## 1       Wii Sports
    ## 2       NES Sports
    ## 3        GB Sports
    ## 4        DS Sports
    ## 5      X360 Sports
    ## 6       PS3 Sports
    ## 7       PS2 Sports
    ## 8      SNES Sports
    ## 9       GBA Sports
    ## 10      3DS Sports
    ## 11      PS4 Sports
    ## 12      N64 Sports
    ## 13       PS Sports
    ## 14       XB Sports
    ## 15       PC Sports
    ## 16     2600 Sports
    ## 17      PSP Sports
    ## 18     XOne Sports
    ## 19       GC Sports
    ## 20     WiiU Sports
    ## 21      GEN Sports
    ## 22       DC Sports
    ## 23      PSV Sports
    ## 24      SAT Sports
    ## 25      SCD Sports

    Finding missing combination

    actual_combinations <- data %>%
      select(Platform, Genre) %>%
      distinct()
    
    missing_combinations <- anti_join(all_combinations, actual_combinations, by = c("Platform", "Genre"))
    
    print(head(missing_combinations,25))
    ##    Platform        Genre
    ## 1       SCD       Sports
    ## 2        WS       Sports
    ## 3      TG16       Sports
    ## 4       3DO       Sports
    ## 5        GG       Sports
    ## 6      PCFX       Sports
    ## 7        WS     Platform
    ## 8        NG     Platform
    ## 9      TG16     Platform
    ## 10      3DO     Platform
    ## 11     PCFX     Platform
    ## 12       WS       Racing
    ## 13       NG       Racing
    ## 14     TG16       Racing
    ## 15      3DO       Racing
    ## 16       GG       Racing
    ## 17     PCFX       Racing
    ## 18     2600 Role-Playing
    ## 19       NG Role-Playing
    ## 20     TG16 Role-Playing
    ## 21      3DO Role-Playing
    ## 22       GG Role-Playing
    ## 23     XOne       Puzzle
    ## 24      GEN       Puzzle
    ## 25       DC       Puzzle

    Why Might These Combinations Be Missing?

  • Owing to hardware or technical issues, some platforms might not be able to support a particular genre.

  • On some platforms, some genres can not be well-liked or in high demand.

  • Game publishers and developers frequently concentrate on particular platforms where they anticipate the greatest levels of interaction or sales.


    Counting the frequency of the combination

    combination_counts <- data %>%
      group_by(Platform, Genre) %>%
      summarize(count = n()) %>%
      arrange(desc(count))
    ## `summarise()` has grouped output by 'Platform'. You can override using the
    ## `.groups` argument.
    print(head(combination_counts,25))
    ## # A tibble: 25 × 3
    ## # Groups:   Platform [8]
    ##    Platform Genre      count
    ##    <chr>    <chr>      <int>
    ##  1 PS2      Sports       400
    ##  2 DS       Misc         393
    ##  3 PS3      Action       380
    ##  4 PS2      Action       348
    ##  5 DS       Action       343
    ##  6 X360     Action       324
    ##  7 DS       Simulation   285
    ##  8 Wii      Misc         280
    ##  9 Wii      Sports       261
    ## 10 DS       Adventure    240
    ## # ℹ 15 more rows


    Find Most and Least Common Combinations:

    most_common <- combination_counts %>% filter(count == max(count))
    
    least_common <- combination_counts %>% filter(count == min(count))
    
    print("Most common combinations:")
    ## [1] "Most common combinations:"
    print(head(most_common,15))
    ## # A tibble: 15 × 3
    ## # Groups:   Platform [15]
    ##    Platform Genre    count
    ##    <chr>    <chr>    <int>
    ##  1 PS2      Sports     400
    ##  2 DS       Misc       393
    ##  3 PS3      Action     380
    ##  4 X360     Action     324
    ##  5 Wii      Misc       280
    ##  6 PS       Sports     222
    ##  7 PSP      Action     222
    ##  8 PC       Strategy   188
    ##  9 3DS      Action     182
    ## 10 XB       Sports     170
    ## 11 GBA      Action     167
    ## 12 PSV      Action     142
    ## 13 PS4      Action     122
    ## 14 GC       Sports     110
    ## 15 N64      Sports      80
    print("Least common combinations:")
    ## [1] "Least common combinations:"
    print(head(least_common,15))
    ## # A tibble: 15 × 3
    ## # Groups:   Platform [14]
    ##    Platform Genre      count
    ##    <chr>    <chr>      <int>
    ##  1 DS       Fighting      36
    ##  2 PS       Puzzle        32
    ##  3 PSP      Simulation    29
    ##  4 Wii      Strategy      25
    ##  5 GBA      Simulation    18
    ##  6 GBA      Strategy      18
    ##  7 PS2      Puzzle        18
    ##  8 GC       Strategy      11
    ##  9 3DS      Shooter        7
    ## 10 X360     Puzzle         7
    ## 11 XB       Puzzle         7
    ## 12 PC       Fighting       6
    ## 13 N64      Adventure      4
    ## 14 SNES     Adventure      4
    ## 15 PS3      Puzzle         3

Most Common Combinations:

  • Games from many different genres can be found on popular platforms like PlayStation and Xbox, which have big user bases. This leads to high counts for popular combinations, such as “Platformer” or “Action” games on PlayStation.

  • Some genres, such as “Action,” “Adventure,” or “Sports,” have a tendency to be more well-liked on several platforms, which results in greater tallies.

    Least Common Combinations:

  • Because of hardware limitations, some systems might not be able to support certain genres as well. For instance, compared to PCs, “Simulation” and “Strategy” games may be less popular on consoles.

  • Some publishers or developers might concentrate on particular genres or platforms, making other pairings uncommon or nonexistent.

    Visualization for the Game Combinations by Platform and Genre

library(ggplot2)
ggplot(combination_counts, aes(x = Platform, y = count, fill = Genre)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Bar Graph of Game Combinations by Platform and Genre", 
       x = "Platform", 
       y = "Frequency of Combinations", 
       fill = "Genre") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Insight :

  • PS2, PS3, PS4, Wii, DS, and X360 account for over 60% of all genre-platform combinations, indicating their broad appeal and game diversity.

  • Top Genres: Action, Adventure, Sports, and Role-Playing games together make up more than 70% of all combinations, highlighting their popularity across multiple platforms.

  • Specialized Platforms: Wii and DS dominate Puzzle and Sports genres, contributing to over 40% of the games in these genres, suggesting a focus on family and casual gaming.

  • Underrepresented Platforms: Platforms like PCFX and Neo Geo (NG) have less than 1% of all combinations, indicating limited market reach or developer interest.

Significance :

  • Developers can use this data to decide which platforms to release their games on, based on the popularity of certain genres.

  • Console makers can use these insights to attract more developers or to focus on certain genres that are popular on their platform.

Further Questions :

  • Why are some genres less common on certain platforms?

  • What factors drive the popularity of certain genres on specific platforms?

  • How do market trends or technological constraints influence these patterns?