# Load the sales data set from a local CSV file into `data`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# script only runs where this exact file exists — consider a project-relative
# path.
data <- read.csv("C:\\Users\\gajaw\\OneDrive\\Desktop\\STATS\\vgsales.csv")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Draw five samples of rows (with replacement), each half the size of `data`.
set.seed(123)  # fixed seed so the same rows are drawn on every run
n_rows <- nrow(data)
half_n <- round(0.5 * n_rows)
# The five draws are made in order from the same RNG stream, so each sample
# depends on the draws made before it.
df_1 <- data[sample(n_rows, size = half_n, replace = TRUE), ]
df_2 <- data[sample(n_rows, size = half_n, replace = TRUE), ]
df_3 <- data[sample(n_rows, size = half_n, replace = TRUE), ]
df_4 <- data[sample(n_rows, size = half_n, replace = TRUE), ]
df_5 <- data[sample(n_rows, size = half_n, replace = TRUE), ]
head(df_1)
##        Rank                              Name Platform Year     Genre
## 2986   2987 Prince of Persia: The Two Thrones      PS2 2005    Action
## 1842   1843        NBA Jam Tournament Edition      GEN 1994    Sports
## 3371   3372            Xena: Warrior Princess       PS 1998 Adventure
## 11638 11639      Daniel X: The Ultimate Power       DS 2010    Action
## 4761   4762                       TNA iMPACT!      PS3 2008  Fighting
## 6746   6747                    Cradle of Rome       DS 2008    Puzzle
##                   Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
## 2986                Ubisoft     0.57     0.02        0        0.09         0.68
## 1842  Acclaim Entertainment     0.95     0.14        0        0.03         1.11
## 3371        Electronic Arts     0.33     0.23        0        0.04         0.60
## 11638                   THQ     0.07     0.00        0        0.01         0.08
## 4761           Midway Games     0.22     0.12        0        0.06         0.41
## 6746      Rising Star Games     0.06     0.16        0        0.03         0.25
head(df_2)
##        Rank                                          Name Platform Year
## 8086   8087  Battle Commander: Hachibushu Shura no Heihou     SNES 1991
## 1964   1965                  Apollo Justice: Ace Attorney       DS 2007
## 11691 11692                            Storybook Workshop      Wii 2009
## 3638   3639 Harry Potter and the Deathly Hallows - Part 1      Wii 2010
## 6793   6794                                 Madden NFL 13     WiiU 2012
## 12343 12344                  Arc the Lad: End of Darkness      PS2 2004
##              Genre                    Publisher NA_Sales EU_Sales JP_Sales
## 8086      Strategy                    Banpresto     0.00     0.00     0.18
## 1964     Adventure                       Capcom     0.32     0.06     0.64
## 11691         Misc Konami Digital Entertainment     0.07     0.00     0.00
## 3638        Action              Electronic Arts     0.22     0.28     0.00
## 6793        Sports              Electronic Arts     0.22     0.00     0.00
## 12343 Role-Playing  Sony Computer Entertainment     0.03     0.02     0.00
##       Other_Sales Global_Sales
## 8086         0.00         0.18
## 1964         0.04         1.06
## 11691        0.01         0.08
## 3638         0.05         0.55
## 6793         0.02         0.24
## 12343        0.01         0.06
head(df_3)
##        Rank
## 5129   5130
## 6834   6835
## 15748 15750
## 14239 14241
## 10627 10628
## 313     313
##                                                                                                                             Name
## 5129                                                                                                                 Custom Robo
## 6834                                                                                                    Super Monkey Ball Deluxe
## 15748 Kochira Katsushikaku Kameari Kouenmae Hashutsujo: Machiteba Tengoku! Makereba Jigoku! Ryoutsuryuu Ikkakusenkin Daisakusen!
## 14239                                                                                      Prince of Persia: The Forgotten Sands
## 10627                                                                                                           Nodame Cantabile
## 313                                                                                                           Dragon Warrior III
##       Platform Year        Genre          Publisher NA_Sales EU_Sales JP_Sales
## 5129        GC 2004 Role-Playing           Nintendo     0.29     0.07     0.00
## 6834       PS2 2005         Misc               Sega     0.12     0.09     0.00
## 15748       DS 2010         Misc Namco Bandai Games     0.00     0.00     0.02
## 14239       PC 2010       Action            Ubisoft     0.00     0.03     0.00
## 10627       DS 2007         Misc Namco Bandai Games     0.00     0.00     0.10
## 313        NES 1988 Role-Playing   Enix Corporation     0.10     0.00     3.77
##       Other_Sales Global_Sales
## 5129         0.01         0.37
## 6834         0.03         0.24
## 15748        0.00         0.02
## 14239        0.00         0.03
## 10627        0.00         0.10
## 313          0.00         3.87
head(df_4)
##        Rank                                     Name Platform Year        Genre
## 15925 15927       RollerCoaster Tycoon: Gold Edition       PC 2002     Strategy
## 11033 11034       Fullmetal Alchemist: Dual Sympathy       DS 2005 Role-Playing
## 14497 14499                          CIMA: The Enemy      GBA 2003 Role-Playing
## 7066   7067               Tetris 2 (weekly jp sales)       GB 1992       Puzzle
## 9794   9795 Atelier Annie: Alchemists of Sera Island       DS 2009 Role-Playing
## 2555   2556                              Dying Light     XOne 2015       Action
##                                    Publisher NA_Sales EU_Sales JP_Sales
## 15925                             Infogrames     0.01     0.00     0.00
## 11033                     Empire Interactive     0.05     0.00     0.04
## 14497                  Marvelous Interactive     0.02     0.01     0.00
## 7066                                Nintendo     0.00     0.00     0.23
## 9794                                    Gust     0.08     0.00     0.03
## 2555  Warner Bros. Interactive Entertainment     0.43     0.30     0.01
##       Other_Sales Global_Sales
## 15925        0.00         0.02
## 11033        0.00         0.09
## 14497        0.00         0.03
## 7066         0.00         0.23
## 9794         0.01         0.12
## 2555         0.07         0.81
head(df_5)
##        Rank                                                     Name Platform
## 12122 12123 Katekyoo Hitman Reborn! DS: Fate of Heat - Hono no Unmei       DS
## 9581   9582                                    Star Wars: Clone Wars       XB
## 13613 13614                        Emergency Room: Real Life Rescues       DS
## 258     258                           Call of Duty: Advanced Warfare     X360
## 4975   4976            Jam Sessions: Sing and Play Guitar (US sales)       DS
## 14679 14681                  The IdolM@ster: Gravure For You! Vol. 6      PS3
##       Year        Genre          Publisher NA_Sales EU_Sales JP_Sales
## 12122 2008 Role-Playing        Takara Tomy     0.00     0.00     0.07
## 9581  2003      Shooter          LucasArts     0.10     0.03     0.00
## 13613 2009   Simulation          505 Games     0.04     0.00     0.00
## 258   2014      Shooter         Activision     2.75     1.18     0.00
## 4975  2007         Misc            Ubisoft     0.38     0.00     0.00
## 14679 2012       Action Namco Bandai Games     0.00     0.00     0.03
##       Other_Sales Global_Sales
## 12122        0.00         0.07
## 9581         0.00         0.13
## 13613        0.00         0.04
## 258          0.37         4.31
## 4975         0.00         0.38
## 14679        0.00         0.03

Insight Gathered:

Like the original data set, each generated sample contains both continuous variables (such as sales figures and release years) and categorical variables (such as game genres and publishers). Sampling with replacement let us see how different subsets of the data can differ from one another, revealing the variability present across the data set.

Significance:

Because unpredictability and variability are common in real-world data collection, this sampling procedure is significant. By modelling this variability, we can learn how statistical measures such as means, medians, and distributions vary among subsamples. This method forms the basis of bootstrapping, which helps evaluate the precision and reliability of statistical estimates, so that results are representative of the wider population rather than specific to a single sample.

Further Questions for Investigation:

# Summarise one subsample by genre and by publisher/genre pair.
#
# Prints a header line and both summary tables, then returns them.
#
# Args:
#   subsample: data frame with Genre, Publisher and Global_Sales columns.
#   subsample_number: label shown in the printed header.
#
# Returns:
#   A list with two elements:
#     genre_summary           - row count, mean and median Global_Sales per
#                               Genre.
#     publisher_genre_summary - row count and total Global_Sales per
#                               Publisher/Genre pair, largest total first.
#   Both tables are returned ungrouped.
analyze_subsamples_grouped <- function(subsample, subsample_number) {
  cat("\n--- Analysis for Subsample", subsample_number, "---\n")

  # Per-genre counts and central tendency of global sales.
  genre_summary <- subsample %>%
    group_by(Genre) %>%
    summarise(
      Count = n(),
      Mean_Sales = mean(Global_Sales, na.rm = TRUE),
      Median_Sales = median(Global_Sales, na.rm = TRUE),
      .groups = "drop"
    )

  print(genre_summary)

  # Per publisher/genre totals. `.groups = "drop"` silences the
  # "summarise() has grouped output by 'Publisher'" message and stops the
  # result from staying grouped by Publisher for later verbs.
  publisher_genre_summary <- subsample %>%
    group_by(Publisher, Genre) %>%
    summarise(
      Count = n(),
      Total_Sales = sum(Global_Sales, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    arrange(desc(Total_Sales))

  print("Publisher-Genre Summary:")
  print(publisher_genre_summary)

  list(
    genre_summary = genre_summary,
    publisher_genre_summary = publisher_genre_summary
  )
}

# Run the grouped analysis on every subsample; results keep the list order.
list_dfs <- list(df_1, df_2, df_3, df_4, df_5)
# seq_along() keeps the index range in step with the list instead of a
# hard-coded 1:5.
results_list <- lapply(
  seq_along(list_dfs),
  function(i) analyze_subsamples_grouped(list_dfs[[i]], i)
)
## 
## --- Analysis for Subsample 1 ---
## # A tibble: 12 × 4
##    Genre        Count Mean_Sales Median_Sales
##    <chr>        <int>      <dbl>        <dbl>
##  1 Action        1723      0.520        0.19 
##  2 Adventure      600      0.181        0.05 
##  3 Fighting       397      0.553        0.24 
##  4 Misc           893      0.523        0.17 
##  5 Platform       416      0.832        0.22 
##  6 Puzzle         309      0.350        0.11 
##  7 Racing         583      0.550        0.16 
##  8 Role-Playing   736      0.712        0.205
##  9 Shooter        647      0.893        0.24 
## 10 Simulation     440      0.383        0.13 
## 11 Sports        1226      0.532        0.21 
## 12 Strategy       329      0.239        0.09
## `summarise()` has grouped output by 'Publisher'. You can override using the
## `.groups` argument.
## [1] "Publisher-Genre Summary:"
## # A tibble: 1,184 × 4
## # Groups:   Publisher [407]
##    Publisher            Genre        Count Total_Sales
##    <chr>                <chr>        <int>       <dbl>
##  1 Electronic Arts      Sports         277       237. 
##  2 Activision           Shooter         69       198. 
##  3 Nintendo             Role-Playing    57       195. 
##  4 Nintendo             Platform        48       173. 
##  5 Take-Two Interactive Action          48       116. 
##  6 Nintendo             Misc            54       111. 
##  7 Activision           Action         167        92.6
##  8 Nintendo             Sports          32        83.6
##  9 Electronic Arts      Racing          78        81.4
## 10 Electronic Arts      Shooter         67        78.0
## # ℹ 1,174 more rows
## 
## --- Analysis for Subsample 2 ---
## # A tibble: 12 × 4
##    Genre        Count Mean_Sales Median_Sales
##    <chr>        <int>      <dbl>        <dbl>
##  1 Action        1655      0.560        0.19 
##  2 Adventure      620      0.194        0.05 
##  3 Fighting       455      0.511        0.2  
##  4 Misc           854      0.437        0.16 
##  5 Platform       443      1.06         0.24 
##  6 Puzzle         281      0.440        0.13 
##  7 Racing         676      0.492        0.195
##  8 Role-Playing   739      0.527        0.18 
##  9 Shooter        668      0.801        0.23 
## 10 Simulation     405      0.365        0.14 
## 11 Sports        1197      0.537        0.23 
## 12 Strategy       306      0.270        0.11
## `summarise()` has grouped output by 'Publisher'. You can override using the
## `.groups` argument.
## [1] "Publisher-Genre Summary:"
## # A tibble: 1,200 × 4
## # Groups:   Publisher [394]
##    Publisher            Genre        Count Total_Sales
##    <chr>                <chr>        <int>       <dbl>
##  1 Nintendo             Platform        55       262. 
##  2 Electronic Arts      Sports         288       247. 
##  3 Activision           Shooter         98       172. 
##  4 Take-Two Interactive Action          49       163. 
##  5 Nintendo             Role-Playing    47        90.9
##  6 Ubisoft              Action         113        86.9
##  7 Nintendo             Misc            53        79.3
##  8 Electronic Arts      Shooter         58        78.6
##  9 Activision           Action         143        74.1
## 10 Electronic Arts      Racing          93        70.8
## # ℹ 1,190 more rows
## 
## --- Analysis for Subsample 3 ---
## # A tibble: 12 × 4
##    Genre        Count Mean_Sales Median_Sales
##    <chr>        <int>      <dbl>        <dbl>
##  1 Action        1627      0.531        0.19 
##  2 Adventure      679      0.170        0.06 
##  3 Fighting       433      0.584        0.2  
##  4 Misc           894      0.529        0.17 
##  5 Platform       445      0.786        0.28 
##  6 Puzzle         306      0.395        0.09 
##  7 Racing         650      0.648        0.23 
##  8 Role-Playing   700      0.699        0.19 
##  9 Shooter        666      0.749        0.205
## 10 Simulation     412      0.459        0.135
## 11 Sports        1181      0.605        0.22 
## 12 Strategy       306      0.260        0.09
## `summarise()` has grouped output by 'Publisher'. You can override using the
## `.groups` argument.
## [1] "Publisher-Genre Summary:"
## # A tibble: 1,200 × 4
## # Groups:   Publisher [404]
##    Publisher                   Genre        Count Total_Sales
##    <chr>                       <chr>        <int>       <dbl>
##  1 Electronic Arts             Sports         320       282. 
##  2 Nintendo                    Role-Playing    59       189. 
##  3 Nintendo                    Platform        59       167. 
##  4 Activision                  Shooter         84       166. 
##  5 Nintendo                    Misc            44       124. 
##  6 Sony Computer Entertainment Racing          42       108. 
##  7 Nintendo                    Sports          26       102. 
##  8 Activision                  Action         151        92.1
##  9 Electronic Arts             Shooter         75        86.4
## 10 Take-Two Interactive        Action          41        80.9
## # ℹ 1,190 more rows
## 
## --- Analysis for Subsample 4 ---
## # A tibble: 12 × 4
##    Genre        Count Mean_Sales Median_Sales
##    <chr>        <int>      <dbl>        <dbl>
##  1 Action        1744      0.527         0.19
##  2 Adventure      663      0.211         0.06
##  3 Fighting       412      0.544         0.19
##  4 Misc           877      0.422         0.15
##  5 Platform       410      0.929         0.29
##  6 Puzzle         274      0.385         0.09
##  7 Racing         628      0.598         0.19
##  8 Role-Playing   733      0.556         0.18
##  9 Shooter        643      0.659         0.2 
## 10 Simulation     411      0.419         0.16
## 11 Sports        1161      0.546         0.23
## 12 Strategy       343      0.237         0.09
## `summarise()` has grouped output by 'Publisher'. You can override using the
## `.groups` argument.
## [1] "Publisher-Genre Summary:"
## # A tibble: 1,216 × 4
## # Groups:   Publisher [415]
##    Publisher                   Genre        Count Total_Sales
##    <chr>                       <chr>        <int>       <dbl>
##  1 Electronic Arts             Sports         282       251. 
##  2 Nintendo                    Platform        54       168. 
##  3 Activision                  Shooter         75       155. 
##  4 Take-Two Interactive        Action          50       148. 
##  5 Nintendo                    Role-Playing    55       123. 
##  6 Nintendo                    Sports          36        90.0
##  7 Activision                  Action         172        72.7
##  8 Sony Computer Entertainment Racing          32        71.7
##  9 Microsoft Game Studios      Misc            22        71.7
## 10 Nintendo                    Action          45        70.8
## # ℹ 1,206 more rows
## 
## --- Analysis for Subsample 5 ---
## # A tibble: 12 × 4
##    Genre        Count Mean_Sales Median_Sales
##    <chr>        <int>      <dbl>        <dbl>
##  1 Action        1666      0.512         0.2 
##  2 Adventure      677      0.195         0.06
##  3 Fighting       432      0.526         0.19
##  4 Misc           877      0.534         0.17
##  5 Platform       454      0.814         0.26
##  6 Puzzle         278      0.503         0.11
##  7 Racing         636      0.673         0.22
##  8 Role-Playing   789      0.550         0.19
##  9 Shooter        653      0.682         0.22
## 10 Simulation     435      0.492         0.14
## 11 Sports        1101      0.521         0.23
## 12 Strategy       301      0.289         0.11
## `summarise()` has grouped output by 'Publisher'. You can override using the
## `.groups` argument.
## [1] "Publisher-Genre Summary:"
## # A tibble: 1,210 × 4
## # Groups:   Publisher [421]
##    Publisher       Genre        Count Total_Sales
##    <chr>           <chr>        <int>       <dbl>
##  1 Electronic Arts Sports         266       226. 
##  2 Nintendo        Platform        47       156. 
##  3 Nintendo        Misc            52       110. 
##  4 Nintendo        Racing          21       110. 
##  5 Nintendo        Role-Playing    63       103. 
##  6 Activision      Shooter         74        95.7
##  7 Activision      Action         154        92.2
##  8 Electronic Arts Shooter         77        80.6
##  9 Electronic Arts Racing          86        78.9
## 10 Nintendo        Puzzle          33        74.9
## # ℹ 1,200 more rows
# Simulate the sampling distribution of a column's mean by resampling.
#
# Each simulation resamples the column with replacement (same length as the
# column) and records the mean of that resample, ignoring NA values. A
# histogram of the simulated means is drawn on the current graphics device.
#
# Args:
#   subsample: data frame (or list) holding the column to resample.
#   column_to_simulate: name of the column, as a string.
#   num_simulations: number of resamples to draw; must be a single
#     non-negative number.
#
# Returns:
#   Numeric vector of simulated means, one per simulation.
perform_monte_carlo <- function(subsample, column_to_simulate, num_simulations = 1000) {
  stopifnot(
    is.numeric(num_simulations),
    length(num_simulations) == 1L,
    num_simulations >= 0
  )

  values <- subsample[[column_to_simulate]]
  if (is.null(values)) {
    stop(
      "Column '", column_to_simulate, "' not found in subsample",
      call. = FALSE
    )
  }

  # seq_len() yields an empty sequence for zero simulations, whereas
  # 1:num_simulations would iterate over c(1, 0).
  simulated_means <- vapply(
    seq_len(num_simulations),
    function(i) mean(sample(values, replace = TRUE), na.rm = TRUE),
    numeric(1)
  )

  # hist() cannot draw an empty vector, so plot only when there are results.
  if (length(simulated_means) > 0) {
    hist(simulated_means, main = paste("Simulation for", column_to_simulate, "Means"),
         xlab = "Simulated Means", col = "red")
  }

  simulated_means
}

# Run the simulation on "Global_Sales" for every subsample. The returned
# means are discarded; only the header and the histogram are produced.
# seq_along() keeps the loop in step with list_dfs instead of a fixed 1:5.
for (i in seq_along(list_dfs)) {
  cat("\n--- Monte Carlo Simulation for Subsample", i, "---\n")
  perform_monte_carlo(list_dfs[[i]], "Global_Sales")
}
## 
## --- Monte Carlo Simulation for Subsample 1 ---

## 
## --- Monte Carlo Simulation for Subsample 2 ---

## 
## --- Monte Carlo Simulation for Subsample 3 ---

## 
## --- Monte Carlo Simulation for Subsample 4 ---

## 
## --- Monte Carlo Simulation for Subsample 5 ---

Insight Gathered:

The subsample analysis revealed variability in sales across publishers and genres. Certain genres showed consistently high sales across all subsamples, while others varied noticeably, indicating which trends are dominant and stable. Furthermore, the Monte Carlo simulation illustrated the range of average sales values for the “Global_Sales” variable, giving a more comprehensive understanding of the dataset’s possible variability.

Significance:

This study emphasises how crucial it is to understand variability and uncertainty when analysing data. Consistent patterns across subsamples indicate that a trend is reliable; on the other hand, variations highlight the risk of drawing incorrect inferences from a single sample. The Monte Carlo simulation reinforces the need to take data variability into account when forming conclusions by illustrating the range of possible outcomes.

Further Questions for Investigation: