# Load the video game sales data (vgsales.csv) into `data`.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file location exists.
data <- read.csv(
  file = "C:\\Users\\gajaw\\OneDrive\\Desktop\\STATS\\vgsales.csv"
)

Creating new variables

Variable - Deviation Total Sales

# Absolute distance of each game's Global_Sales from the overall mean of
# Global_Sales (missing values are ignored when computing the mean).
data$deviation_total_sales <- abs(
  data$Global_Sales - mean(data$Global_Sales, na.rm = TRUE)
)
# Show the first 15 rows to confirm the new column.
head(data, n = 15)
##    Rank                        Name Platform Year        Genre Publisher
## 1     1                  Wii Sports      Wii 2006       Sports  Nintendo
## 2     2           Super Mario Bros.      NES 1985     Platform  Nintendo
## 3     3              Mario Kart Wii      Wii 2008       Racing  Nintendo
## 4     4           Wii Sports Resort      Wii 2009       Sports  Nintendo
## 5     5    Pokemon Red/Pokemon Blue       GB 1996 Role-Playing  Nintendo
## 6     6                      Tetris       GB 1989       Puzzle  Nintendo
## 7     7       New Super Mario Bros.       DS 2006     Platform  Nintendo
## 8     8                    Wii Play      Wii 2006         Misc  Nintendo
## 9     9   New Super Mario Bros. Wii      Wii 2009     Platform  Nintendo
## 10   10                   Duck Hunt      NES 1984      Shooter  Nintendo
## 11   11                  Nintendogs       DS 2005   Simulation  Nintendo
## 12   12               Mario Kart DS       DS 2005       Racing  Nintendo
## 13   13 Pokemon Gold/Pokemon Silver       GB 1999 Role-Playing  Nintendo
## 14   14                     Wii Fit      Wii 2007       Sports  Nintendo
## 15   15                Wii Fit Plus      Wii 2009       Sports  Nintendo
##    NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales deviation_total_sales
## 1     41.49    29.02     3.77        8.46        82.74              82.20256
## 2     29.08     3.58     6.81        0.77        40.24              39.70256
## 3     15.85    12.88     3.79        3.31        35.82              35.28256
## 4     15.75    11.01     3.28        2.96        33.00              32.46256
## 5     11.27     8.89    10.22        1.00        31.37              30.83256
## 6     23.20     2.26     4.22        0.58        30.26              29.72256
## 7     11.38     9.23     6.50        2.90        30.01              29.47256
## 8     14.03     9.20     2.93        2.85        29.02              28.48256
## 9     14.59     7.06     4.70        2.26        28.62              28.08256
## 10    26.93     0.63     0.28        0.47        28.31              27.77256
## 11     9.07    11.00     1.93        2.75        24.76              24.22256
## 12     9.81     7.57     4.13        1.92        23.42              22.88256
## 13     9.00     6.18     7.20        0.71        23.10              22.56256
## 14     8.94     8.03     3.60        2.15        22.72              22.18256
## 15     9.09     8.59     2.53        1.79        22.00              21.46256

Variable - Deviation Year

# Convert Year to numeric. Entries that cannot be parsed as numbers become NA,
# which is what produces the coercion warning echoed below.
data$Year <- as.numeric(as.character(data$Year))
## Warning: NAs introduced by coercion
# Fill the missing years with the mean of the observed years.
data$Year <- replace(
  data$Year,
  is.na(data$Year),
  mean(data$Year, na.rm = TRUE)
)

# Absolute distance of each release year from the mean year (computed after
# the missing values were filled in).
data$deviation_year <- abs(data$Year - mean(data$Year, na.rm = TRUE))

# Show the first 15 rows to confirm the new column.
head(data, n = 15)
##    Rank                        Name Platform Year        Genre Publisher
## 1     1                  Wii Sports      Wii 2006       Sports  Nintendo
## 2     2           Super Mario Bros.      NES 1985     Platform  Nintendo
## 3     3              Mario Kart Wii      Wii 2008       Racing  Nintendo
## 4     4           Wii Sports Resort      Wii 2009       Sports  Nintendo
## 5     5    Pokemon Red/Pokemon Blue       GB 1996 Role-Playing  Nintendo
## 6     6                      Tetris       GB 1989       Puzzle  Nintendo
## 7     7       New Super Mario Bros.       DS 2006     Platform  Nintendo
## 8     8                    Wii Play      Wii 2006         Misc  Nintendo
## 9     9   New Super Mario Bros. Wii      Wii 2009     Platform  Nintendo
## 10   10                   Duck Hunt      NES 1984      Shooter  Nintendo
## 11   11                  Nintendogs       DS 2005   Simulation  Nintendo
## 12   12               Mario Kart DS       DS 2005       Racing  Nintendo
## 13   13 Pokemon Gold/Pokemon Silver       GB 1999 Role-Playing  Nintendo
## 14   14                     Wii Fit      Wii 2007       Sports  Nintendo
## 15   15                Wii Fit Plus      Wii 2009       Sports  Nintendo
##    NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales deviation_total_sales
## 1     41.49    29.02     3.77        8.46        82.74              82.20256
## 2     29.08     3.58     6.81        0.77        40.24              39.70256
## 3     15.85    12.88     3.79        3.31        35.82              35.28256
## 4     15.75    11.01     3.28        2.96        33.00              32.46256
## 5     11.27     8.89    10.22        1.00        31.37              30.83256
## 6     23.20     2.26     4.22        0.58        30.26              29.72256
## 7     11.38     9.23     6.50        2.90        30.01              29.47256
## 8     14.03     9.20     2.93        2.85        29.02              28.48256
## 9     14.59     7.06     4.70        2.26        28.62              28.08256
## 10    26.93     0.63     0.28        0.47        28.31              27.77256
## 11     9.07    11.00     1.93        2.75        24.76              24.22256
## 12     9.81     7.57     4.13        1.92        23.42              22.88256
## 13     9.00     6.18     7.20        0.71        23.10              22.56256
## 14     8.94     8.03     3.60        2.15        22.72              22.18256
## 15     9.09     8.59     2.53        1.79        22.00              21.46256
##    deviation_year
## 1       0.4064433
## 2      21.4064433
## 3       1.5935567
## 4       2.5935567
## 5      10.4064433
## 6      17.4064433
## 7       0.4064433
## 8       0.4064433
## 9       2.5935567
## 10     22.4064433
## 11      1.4064433
## 12      1.4064433
## 13      7.4064433
## 14      0.5935567
## 15      2.5935567

Visualization

Visualization for pair 1: Global Sales and Deviation Total Sales

library(ggplot2)
# Scatter plot of Global_Sales against deviation_total_sales with a
# linear-model ("lm") trend line drawn on top of the points.
ggplot(
  data = data,
  mapping = aes(x = deviation_total_sales, y = Global_Sales)
) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  ggtitle("Global Sales vs. Deviation from Total Sales") +
  xlab("Deviation from Total Sales (in millions)") +
  ylab("Global Sales (in millions)")
## `geom_smooth()` using formula = 'y ~ x'

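  • The plot shows a clear linear relationship between Global Sales and Deviation from Total Sales: as the deviation increases, global sales also increase, following a nearly perfect positive linear trend. This is largely by construction, because the deviation is the absolute difference between each game's Global Sales and the mean Global Sales, so every game that sells above the mean falls exactly on a straight line.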

  • Outliers: At the far right of the plot there is a noticeable outlier with a very high Deviation from Total Sales and correspondingly high Global Sales. This point represents a video game whose sales are far higher than those of most other games in the dataset. Most data points are clustered at low deviation values, indicating that most games have sales figures close to the overall mean.

Visualization for pair 2: Year and Deviation Year

# Year vs. deviation_year: scatter plot with a linear-model ("lm") trend line
# drawn on top of the points.
ggplot(
  data = data,
  mapping = aes(x = deviation_year, y = Year)
) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("Year vs. Deviation from Mean Year") +
  xlab("Deviation from Mean Year") +
  ylab("Year")
## `geom_smooth()` using formula = 'y ~ x'

  • In terms of potential outliers, no extreme outliers are visually evident. The data points appear to follow a relatively consistent pattern along the negative slope, with some mild deviations from the line.

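  • There is a negative linear trend in this plot, which is expected. By definition, a higher deviation means the release year lies further from the mean release year (about 2006; the 2006 titles have a deviation of only about 0.41). The slope is negative because the largest deviations belong to the oldest games (for example, the 1984 and 1985 releases have deviations above 21), so high deviation values are paired mostly with early years.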

  • This is a logical outcome because as the year moves away from the mean year (both in the past and future), the deviation increases symmetrically.
  • However, there is a slightly wider spread of points between 2010 and 2020. Some recent game release years (those after 2010) are scattered further from the regression line, which could indicate some irregularity in the release patterns in more recent years.

  • Similarly, there are a few points before 1985 that could be considered outliers compared to the rest of the data. These might represent very early video games that were rare or released at times when the video game industry was not as developed.

Calculating Correlation Coefficient

Correlation between Global Sales and Deviation from Total Sales

# Pearson correlation (cor's default method) between Global_Sales and its
# absolute deviation from the mean.
cor(x = data[["Global_Sales"]], y = data[["deviation_total_sales"]])
## [1] 0.9675341

This strong positive correlation makes perfect sense based on the plot. It indicates a nearly perfect linear relationship, meaning that as the deviation from total sales increases, global sales also increase almost proportionally. This high correlation reflects that games with larger sales deviations (especially blockbusters) dominate global sales numbers.

Correlation between Year and Deviation from Mean Year

# Pearson correlation (cor's default method) between Year and its absolute
# deviation from the mean year.
cor(x = data[["Year"]], y = data[["deviation_year"]])
## [1] -0.3817814

The moderate negative correlation is consistent with the plot, showing a general downward trend but with some variability. While the relationship between Year and Deviation from Mean Year is negative, it’s not as strong due to the scattering of data points, particularly for recent years. This indicates that the deviation from the mean year doesn’t fully explain the variation in release years but still shows a meaningful trend.

Calculating Confidence Intervals

Confidence Interval for Global Sales:

# 95% confidence interval for the mean of Global_Sales, based on the
# t-distribution.
#
# The sample size counts only the non-missing observations so that it agrees
# with the na.rm = TRUE used for the mean and standard deviation. Using
# nrow(data) would overstate n (and understate the standard error) if any
# Global_Sales value were NA.
n_global_sales <- sum(!is.na(data$Global_Sales))

# Mean and standard error for Global Sales
mean_global_sales <- mean(data$Global_Sales, na.rm = TRUE)
std_error_global_sales <- sd(data$Global_Sales, na.rm = TRUE) /
  sqrt(n_global_sales)

# Half-width of the interval: two-sided 95% t critical value (n - 1 degrees
# of freedom) times the standard error.
margin_global_sales <- qt(0.975, df = n_global_sales - 1) *
  std_error_global_sales
ci_lower_global_sales <- mean_global_sales - margin_global_sales
ci_upper_global_sales <- mean_global_sales + margin_global_sales

# Result
ci_lower_global_sales
## [1] 0.513782
ci_upper_global_sales
## [1] 0.5610993

Insights:

Significance:

Further Questions:

Confidence Interval for Year:

# 95% confidence interval for the mean of Year, based on the t-distribution.
#
# The sample size counts only the non-missing observations so that it agrees
# with the na.rm = TRUE used for the mean and standard deviation. Using
# nrow(data) would overstate n (and understate the standard error) if any
# Year value were NA.
#
# NOTE(review): missing Year values were replaced with the mean earlier in
# this script, so they count as real observations here. That shrinks the
# standard deviation and inflates n, which makes this interval narrower than
# one computed from the observed years only.
n_year <- sum(!is.na(data$Year))

# Mean and standard error for Year
mean_year <- mean(data$Year, na.rm = TRUE)
std_error_year <- sd(data$Year, na.rm = TRUE) / sqrt(n_year)

# Half-width of the interval: two-sided 95% t critical value (n - 1 degrees
# of freedom) times the standard error.
margin_year <- qt(0.975, df = n_year - 1) * std_error_year
ci_lower_year <- mean_year - margin_year
ci_upper_year <- mean_year + margin_year

# Result
ci_lower_year
## [1] 2006.318
ci_upper_year
## [1] 2006.494

Insights:

Significance:

Further Questions: